bpo-31979: Simplify transforming decimals to ASCII (#4336) · python/cpython@9b6c60c (original) (raw)
`@@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)
`
840
840
``
841
841
`/* --- Unicode Object ----------------------------------------------------- */
`
842
842
``
843
``
`-
static PyObject *
`
844
``
`-
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
`
845
``
-
846
843
`static inline Py_ssize_t
`
847
844
`findchar(const void *s, int kind,
`
848
845
`Py_ssize_t size, Py_UCS4 ch,
`
`@@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,
`
9062
9059
`return _PyUnicode_TranslateCharmap(str, mapping, errors);
`
9063
9060
`}
`
9064
9061
``
9065
``
`-
static Py_UCS4
`
9066
``
`-
fix_decimal_and_space_to_ascii(PyObject *self)
`
9067
``
`-
{
`
9068
``
`-
/* No need to call PyUnicode_READY(self) because this function is only
`
9069
``
`-
called as a callback from fixup() which does it already. */
`
9070
``
`-
const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
`
9071
``
`-
const int kind = PyUnicode_KIND(self);
`
9072
``
`-
void *data = PyUnicode_DATA(self);
`
9073
``
`-
Py_UCS4 maxchar = 127, ch, fixed;
`
9074
``
`-
int modified = 0;
`
9075
``
`-
Py_ssize_t i;
`
9076
``
-
9077
``
`-
for (i = 0; i < len; ++i) {
`
9078
``
`-
ch = PyUnicode_READ(kind, data, i);
`
9079
``
`-
fixed = 0;
`
9080
``
`-
if (ch > 127) {
`
9081
``
`-
if (Py_UNICODE_ISSPACE(ch))
`
9082
``
`-
fixed = ' ';
`
9083
``
`-
else {
`
9084
``
`-
const int decimal = Py_UNICODE_TODECIMAL(ch);
`
9085
``
`-
if (decimal >= 0)
`
9086
``
`-
fixed = '0' + decimal;
`
9087
``
`-
}
`
9088
``
`-
if (fixed != 0) {
`
9089
``
`-
modified = 1;
`
9090
``
`-
maxchar = Py_MAX(maxchar, fixed);
`
9091
``
`-
PyUnicode_WRITE(kind, data, i, fixed);
`
9092
``
`-
}
`
9093
``
`-
else
`
9094
``
`-
maxchar = Py_MAX(maxchar, ch);
`
9095
``
`-
}
`
9096
``
`-
}
`
9097
``
-
9098
``
`-
return (modified) ? maxchar : 0;
`
9099
``
`-
}
`
9100
``
-
9101
9062
`PyObject *
`
9102
9063
`_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
`
9103
9064
`{
`
`@@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
`
9107
9068
` }
`
9108
9069
`if (PyUnicode_READY(unicode) == -1)
`
9109
9070
`return NULL;
`
9110
``
`-
if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
`
``
9071
`+
if (PyUnicode_IS_ASCII(unicode)) {
`
9111
9072
`/* If the string is already ASCII, just return the same string */
`
9112
9073
`Py_INCREF(unicode);
`
9113
9074
`return unicode;
`
9114
9075
` }
`
9115
``
`-
return fixup(unicode, fix_decimal_and_space_to_ascii);
`
``
9076
+
``
9077
`+
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
`
``
9078
`+
PyObject *result = PyUnicode_New(len, 127);
`
``
9079
`+
if (result == NULL) {
`
``
9080
`+
return NULL;
`
``
9081
`+
}
`
``
9082
+
``
9083
`+
Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
`
``
9084
`+
int kind = PyUnicode_KIND(unicode);
`
``
9085
`+
const void *data = PyUnicode_DATA(unicode);
`
``
9086
`+
Py_ssize_t i;
`
``
9087
`+
for (i = 0; i < len; ++i) {
`
``
9088
`+
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
`
``
9089
`+
if (ch < 127) {
`
``
9090
`+
out[i] = ch;
`
``
9091
`+
}
`
``
9092
`+
else if (Py_UNICODE_ISSPACE(ch)) {
`
``
9093
`+
out[i] = ' ';
`
``
9094
`+
}
`
``
9095
`+
else {
`
``
9096
`+
int decimal = Py_UNICODE_TODECIMAL(ch);
`
``
9097
`+
if (decimal < 0) {
`
``
9098
`+
out[i] = '?';
`
``
9099
`+
_PyUnicode_LENGTH(result) = i + 1;
`
``
9100
`+
break;
`
``
9101
`+
}
`
``
9102
`+
out[i] = '0' + decimal;
`
``
9103
`+
}
`
``
9104
`+
}
`
``
9105
+
``
9106
`+
return result;
`
9116
9107
`}
`
9117
9108
``
9118
9109
`PyObject *
`
`@@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,
`
9588
9579
`return tailmatch(str, substr, start, end, direction);
`
9589
9580
`}
`
9590
9581
``
9591
``
`-
/* Apply fixfct filter to the Unicode object self and return a
`
9592
``
`-
reference to the modified object */
`
9593
``
-
9594
``
`-
static PyObject *
`
9595
``
`-
fixup(PyObject *self,
`
9596
``
`-
Py_UCS4 (*fixfct)(PyObject *s))
`
9597
``
`-
{
`
9598
``
`-
PyObject *u;
`
9599
``
`-
Py_UCS4 maxchar_old, maxchar_new = 0;
`
9600
``
`-
PyObject *v;
`
9601
``
-
9602
``
`-
u = _PyUnicode_Copy(self);
`
9603
``
`-
if (u == NULL)
`
9604
``
`-
return NULL;
`
9605
``
`-
maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
`
9606
``
-
9607
``
`-
/* fix functions return the new maximum character in a string,
`
9608
``
`-
if the kind of the resulting unicode object does not change,
`
9609
``
`-
everything is fine. Otherwise we need to change the string kind
`
9610
``
`-
and re-run the fix function. */
`
9611
``
`-
maxchar_new = fixfct(u);
`
9612
``
-
9613
``
`-
if (maxchar_new == 0) {
`
9614
``
`-
/* no changes */;
`
9615
``
`-
if (PyUnicode_CheckExact(self)) {
`
9616
``
`-
Py_DECREF(u);
`
9617
``
`-
Py_INCREF(self);
`
9618
``
`-
return self;
`
9619
``
`-
}
`
9620
``
`-
else
`
9621
``
`-
return u;
`
9622
``
`-
}
`
9623
``
-
9624
``
`-
maxchar_new = align_maxchar(maxchar_new);
`
9625
``
-
9626
``
`-
if (maxchar_new == maxchar_old)
`
9627
``
`-
return u;
`
9628
``
-
9629
``
`-
/* In case the maximum character changed, we need to
`
9630
``
`-
convert the string to the new category. */
`
9631
``
`-
v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
`
9632
``
`-
if (v == NULL) {
`
9633
``
`-
Py_DECREF(u);
`
9634
``
`-
return NULL;
`
9635
``
`-
}
`
9636
``
`-
if (maxchar_new > maxchar_old) {
`
9637
``
`-
/* If the maxchar increased so that the kind changed, not all
`
9638
``
`-
characters are representable anymore and we need to fix the
`
9639
``
`-
string again. This only happens in very few cases. */
`
9640
``
`-
_PyUnicode_FastCopyCharacters(v, 0,
`
9641
``
`-
self, 0, PyUnicode_GET_LENGTH(self));
`
9642
``
`-
maxchar_old = fixfct(v);
`
9643
``
`-
assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
`
9644
``
`-
}
`
9645
``
`-
else {
`
9646
``
`-
_PyUnicode_FastCopyCharacters(v, 0,
`
9647
``
`-
u, 0, PyUnicode_GET_LENGTH(self));
`
9648
``
`-
}
`
9649
``
`-
Py_DECREF(u);
`
9650
``
`-
assert(_PyUnicode_CheckConsistency(v, 1));
`
9651
``
`-
return v;
`
9652
``
`-
}
`
9653
``
-
9654
9582
`static PyObject *
`
9655
9583
`ascii_upper_or_lower(PyObject *self, int lower)
`
9656
9584
`{
`