bpo-31979: Simplify transforming decimals to ASCII (#4336) · python/cpython@9b6c60c (original) (raw)

`@@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)

`

840

840

``

841

841

`/* --- Unicode Object ----------------------------------------------------- */

`

842

842

``

843

``

`-

static PyObject *

`

844

``

`-

fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));

`

845

``

-

846

843

`static inline Py_ssize_t

`

847

844

`findchar(const void *s, int kind,

`

848

845

`Py_ssize_t size, Py_UCS4 ch,

`

`@@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,

`

9062

9059

`return _PyUnicode_TranslateCharmap(str, mapping, errors);

`

9063

9060

`}

`

9064

9061

``

9065

``

`-

static Py_UCS4

`

9066

``

`-

fix_decimal_and_space_to_ascii(PyObject *self)

`

9067

``

`-

{

`

9068

``

`-

/* No need to call PyUnicode_READY(self) because this function is only

`

9069

``

`-

called as a callback from fixup() which does it already. */

`

9070

``

`-

const Py_ssize_t len = PyUnicode_GET_LENGTH(self);

`

9071

``

`-

const int kind = PyUnicode_KIND(self);

`

9072

``

`-

void *data = PyUnicode_DATA(self);

`

9073

``

`-

Py_UCS4 maxchar = 127, ch, fixed;

`

9074

``

`-

int modified = 0;

`

9075

``

`-

Py_ssize_t i;

`

9076

``

-

9077

``

`-

for (i = 0; i < len; ++i) {

`

9078

``

`-

ch = PyUnicode_READ(kind, data, i);

`

9079

``

`-

fixed = 0;

`

9080

``

`-

if (ch > 127) {

`

9081

``

`-

if (Py_UNICODE_ISSPACE(ch))

`

9082

``

`-

fixed = ' ';

`

9083

``

`-

else {

`

9084

``

`-

const int decimal = Py_UNICODE_TODECIMAL(ch);

`

9085

``

`-

if (decimal >= 0)

`

9086

``

`-

fixed = '0' + decimal;

`

9087

``

`-

}

`

9088

``

`-

if (fixed != 0) {

`

9089

``

`-

modified = 1;

`

9090

``

`-

maxchar = Py_MAX(maxchar, fixed);

`

9091

``

`-

PyUnicode_WRITE(kind, data, i, fixed);

`

9092

``

`-

}

`

9093

``

`-

else

`

9094

``

`-

maxchar = Py_MAX(maxchar, ch);

`

9095

``

`-

}

`

9096

``

`-

}

`

9097

``

-

9098

``

`-

return (modified) ? maxchar : 0;

`

9099

``

`-

}

`

9100

``

-

9101

9062

`PyObject *

`

9102

9063

`_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)

`

9103

9064

`{

`

`@@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)

`

9107

9068

` }

`

9108

9069

`if (PyUnicode_READY(unicode) == -1)

`

9109

9070

`return NULL;

`

9110

``

`-

if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {

`

``

9071

`+

if (PyUnicode_IS_ASCII(unicode)) {

`

9111

9072

`/* If the string is already ASCII, just return the same string */

`

9112

9073

`Py_INCREF(unicode);

`

9113

9074

`return unicode;

`

9114

9075

` }

`

9115

``

`-

return fixup(unicode, fix_decimal_and_space_to_ascii);

`

``

9076

+

``

9077

`+

Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);

`

``

9078

`+

PyObject *result = PyUnicode_New(len, 127);

`

``

9079

`+

if (result == NULL) {

`

``

9080

`+

return NULL;

`

``

9081

`+

}

`

``

9082

+

``

9083

`+

Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);

`

``

9084

`+

int kind = PyUnicode_KIND(unicode);

`

``

9085

`+

const void *data = PyUnicode_DATA(unicode);

`

``

9086

`+

Py_ssize_t i;

`

``

9087

`+

for (i = 0; i < len; ++i) {

`

``

9088

`+

Py_UCS4 ch = PyUnicode_READ(kind, data, i);

`

``

9089

`+

if (ch < 127) {

`

``

9090

`+

out[i] = ch;

`

``

9091

`+

}

`

``

9092

`+

else if (Py_UNICODE_ISSPACE(ch)) {

`

``

9093

`+

out[i] = ' ';

`

``

9094

`+

}

`

``

9095

`+

else {

`

``

9096

`+

int decimal = Py_UNICODE_TODECIMAL(ch);

`

``

9097

`+

if (decimal < 0) {

`

``

9098

`+

out[i] = '?';

`

``

9099

`+

_PyUnicode_LENGTH(result) = i + 1;

`

``

9100

`+

break;

`

``

9101

`+

}

`

``

9102

`+

out[i] = '0' + decimal;

`

``

9103

`+

}

`

``

9104

`+

}

`

``

9105

+

``

9106

`+

return result;

`

9116

9107

`}

`

9117

9108

``

9118

9109

`PyObject *

`

`@@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,

`

9588

9579

`return tailmatch(str, substr, start, end, direction);

`

9589

9580

`}

`

9590

9581

``

9591

``

`-

/* Apply fixfct filter to the Unicode object self and return a

`

9592

``

`-

reference to the modified object */

`

9593

``

-

9594

``

`-

static PyObject *

`

9595

``

`-

fixup(PyObject *self,

`

9596

``

`-

Py_UCS4 (*fixfct)(PyObject *s))

`

9597

``

`-

{

`

9598

``

`-

PyObject *u;

`

9599

``

`-

Py_UCS4 maxchar_old, maxchar_new = 0;

`

9600

``

`-

PyObject *v;

`

9601

``

-

9602

``

`-

u = _PyUnicode_Copy(self);

`

9603

``

`-

if (u == NULL)

`

9604

``

`-

return NULL;

`

9605

``

`-

maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);

`

9606

``

-

9607

``

`-

/* fix functions return the new maximum character in a string,

`

9608

``

`-

if the kind of the resulting unicode object does not change,

`

9609

``

`-

everything is fine. Otherwise we need to change the string kind

`

9610

``

`-

and re-run the fix function. */

`

9611

``

`-

maxchar_new = fixfct(u);

`

9612

``

-

9613

``

`-

if (maxchar_new == 0) {

`

9614

``

`-

/* no changes */;

`

9615

``

`-

if (PyUnicode_CheckExact(self)) {

`

9616

``

`-

Py_DECREF(u);

`

9617

``

`-

Py_INCREF(self);

`

9618

``

`-

return self;

`

9619

``

`-

}

`

9620

``

`-

else

`

9621

``

`-

return u;

`

9622

``

`-

}

`

9623

``

-

9624

``

`-

maxchar_new = align_maxchar(maxchar_new);

`

9625

``

-

9626

``

`-

if (maxchar_new == maxchar_old)

`

9627

``

`-

return u;

`

9628

``

-

9629

``

`-

/* In case the maximum character changed, we need to

`

9630

``

`-

convert the string to the new category. */

`

9631

``

`-

v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);

`

9632

``

`-

if (v == NULL) {

`

9633

``

`-

Py_DECREF(u);

`

9634

``

`-

return NULL;

`

9635

``

`-

}

`

9636

``

`-

if (maxchar_new > maxchar_old) {

`

9637

``

`-

/* If the maxchar increased so that the kind changed, not all

`

9638

``

`-

characters are representable anymore and we need to fix the

`

9639

``

`-

string again. This only happens in very few cases. */

`

9640

``

`-

_PyUnicode_FastCopyCharacters(v, 0,

`

9641

``

`-

self, 0, PyUnicode_GET_LENGTH(self));

`

9642

``

`-

maxchar_old = fixfct(v);

`

9643

``

`-

assert(maxchar_old > 0 && maxchar_old <= maxchar_new);

`

9644

``

`-

}

`

9645

``

`-

else {

`

9646

``

`-

_PyUnicode_FastCopyCharacters(v, 0,

`

9647

``

`-

u, 0, PyUnicode_GET_LENGTH(self));

`

9648

``

`-

}

`

9649

``

`-

Py_DECREF(u);

`

9650

``

`-

assert(_PyUnicode_CheckConsistency(v, 1));

`

9651

``

`-

return v;

`

9652

``

`-

}

`

9653

``

-

9654

9582

`static PyObject *

`

9655

9583

`ascii_upper_or_lower(PyObject *self, int lower)

`

9656

9584

`{

`