bpo-31979: Simplify transforming decimals to ASCII (#4336) · python/cpython@9b6c60c (original) (raw)

`@@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)

840

841

`/* --- Unicode Object ----------------------------------------------------- */

842

843

static PyObject *

844

fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));

845

-

846

843

`static inline Py_ssize_t

847

844

`findchar(const void *s, int kind,

848

845

`Py_ssize_t size, Py_UCS4 ch,

`@@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,

9062

9059

`return _PyUnicode_TranslateCharmap(str, mapping, errors);

9063

9060

9064

9061

9065

static Py_UCS4

9066

fix_decimal_and_space_to_ascii(PyObject *self)

9067

{

9068

/* No need to call PyUnicode_READY(self) because this function is only

9069

called as a callback from fixup() which does it already. */

9070

const Py_ssize_t len = PyUnicode_GET_LENGTH(self);

9071

const int kind = PyUnicode_KIND(self);

9072

void *data = PyUnicode_DATA(self);

9073

Py_UCS4 maxchar = 127, ch, fixed;

9074

int modified = 0;

9075

Py_ssize_t i;

9076

-

9077

for (i = 0; i < len; ++i) {

9078

ch = PyUnicode_READ(kind, data, i);

9079

fixed = 0;

9080

if (ch > 127) {

9081

if (Py_UNICODE_ISSPACE(ch))

9082

fixed = ' ';

9083

else {

9084

const int decimal = Py_UNICODE_TODECIMAL(ch);

9085

if (decimal >= 0)

9086

fixed = '0' + decimal;

9087

}

9088

if (fixed != 0) {

9089

modified = 1;

9090

maxchar = Py_MAX(maxchar, fixed);

9091

PyUnicode_WRITE(kind, data, i, fixed);

9092

}

9093

else

9094

maxchar = Py_MAX(maxchar, ch);

9095

}

9096

}

9097

-

9098

return (modified) ? maxchar : 0;

9099

}

9100

-

9101

9062

`PyObject *

9102

9063

`_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)

9103

9064

`@@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)

9107

9068

` }

9108

9069

`if (PyUnicode_READY(unicode) == -1)

9109

9070

`return NULL;

9110

if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {

9071

if (PyUnicode_IS_ASCII(unicode)) {

9111

9072

`/* If the string is already ASCII, just return the same string */

9112

9073

`Py_INCREF(unicode);

9113

9074

`return unicode;

9114

9075

` }

9115

return fixup(unicode, fix_decimal_and_space_to_ascii);

9076

+

9077

Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);

9078

PyObject *result = PyUnicode_New(len, 127);

9079

if (result == NULL) {

9080

return NULL;

9081

}

9082

+

9083

Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);

9084

int kind = PyUnicode_KIND(unicode);

9085

const void *data = PyUnicode_DATA(unicode);

9086

Py_ssize_t i;

9087

for (i = 0; i < len; ++i) {

9088

Py_UCS4 ch = PyUnicode_READ(kind, data, i);

9089

if (ch < 127) {

9090

out[i] = ch;

9091

}

9092

else if (Py_UNICODE_ISSPACE(ch)) {

9093

out[i] = ' ';

9094

}

9095

else {

9096

int decimal = Py_UNICODE_TODECIMAL(ch);

9097

if (decimal < 0) {

9098

out[i] = '?';

9099

_PyUnicode_LENGTH(result) = i + 1;

9100

break;

9101

}

9102

out[i] = '0' + decimal;

9103

}

9104

}

9105

+

9106

return result;

9116

9107

9117

9108

9118

9109

`PyObject *

`@@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,

9588

9579

`return tailmatch(str, substr, start, end, direction);

9589

9580

9590

9581

9591

/* Apply fixfct filter to the Unicode object self and return a

9592

reference to the modified object */

9593

-

9594

static PyObject *

9595

fixup(PyObject *self,

9596

Py_UCS4 (*fixfct)(PyObject *s))

9597

{

9598

PyObject *u;

9599

Py_UCS4 maxchar_old, maxchar_new = 0;

9600

PyObject *v;

9601

-

9602

u = _PyUnicode_Copy(self);

9603

if (u == NULL)

9604

return NULL;

9605

maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);

9606

-

9607

/* fix functions return the new maximum character in a string,

9608

if the kind of the resulting unicode object does not change,

9609

everything is fine. Otherwise we need to change the string kind

9610

and re-run the fix function. */

9611

maxchar_new = fixfct(u);

9612

-

9613

if (maxchar_new == 0) {

9614

/* no changes */;

9615

if (PyUnicode_CheckExact(self)) {

9616

Py_DECREF(u);

9617

Py_INCREF(self);

9618

return self;

9619

}

9620

else

9621

return u;

9622

}

9623

-

9624

maxchar_new = align_maxchar(maxchar_new);

9625

-

9626

if (maxchar_new == maxchar_old)

9627

return u;

9628

-

9629

/* In case the maximum character changed, we need to

9630

convert the string to the new category. */

9631

v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);

9632

if (v == NULL) {

9633

Py_DECREF(u);

9634

return NULL;

9635

}

9636

if (maxchar_new > maxchar_old) {

9637

/* If the maxchar increased so that the kind changed, not all

9638

characters are representable anymore and we need to fix the

9639

string again. This only happens in very few cases. */

9640

_PyUnicode_FastCopyCharacters(v, 0,

9641

self, 0, PyUnicode_GET_LENGTH(self));

9642

maxchar_old = fixfct(v);

9643

assert(maxchar_old > 0 && maxchar_old <= maxchar_new);

9644

}

9645

else {

9646

_PyUnicode_FastCopyCharacters(v, 0,

9647

u, 0, PyUnicode_GET_LENGTH(self));

9648

}

9649

Py_DECREF(u);

9650

assert(_PyUnicode_CheckConsistency(v, 1));

9651

return v;

9652

}

9653

-

9654

9582

`static PyObject *

9655

9583

`ascii_upper_or_lower(PyObject *self, int lower)

9656

9584