cpython: 66e2dfbb1d70 (original) (raw)

--- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -77,6 +77,8 @@ class ReTests(unittest.TestCase): self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz') self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz') self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')

self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), @@ -250,6 +252,13 @@ class ReTests(unittest.TestCase): [b'', b'a', b'b', b'c']) self.assertTypedEqual(re.split(b"(:*)", string), [b'', b':', b'a', b':', b'b', b'::', b'c'])

self.assertEqual(re.split("(?::)", ":a:b::c"), ['', 'a', 'b', 'c']) self.assertEqual(re.split("(:)", ":a:b::c"), @@ -287,6 +296,14 @@ class ReTests(unittest.TestCase): [b":", b"::", b":::"]) self.assertTypedEqual(re.findall(b"(:)(:*)", string), [(b":", b""), (b":", b":"), (b":", b"::")])

def test_bug_117612(self): self.assertEqual(re.findall(r"(a|(b))", "aba"), @@ -305,6 +322,12 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match(b'(a)', string).group(0), b'a') self.assertEqual(re.match(b'(a)', string).group(1), b'a') self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))

pat = re.compile('((a)|(b))(c)?') self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))

--- a/Misc/NEWS +++ b/Misc/NEWS @@ -21,6 +21,8 @@ Core and Builtins Library ------- +- Issue #18685: Restore re performance to pre-PEP 393 levels. +

--- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -46,6 +46,8 @@ static char copyright[] = #include "sre.h" +#define SRE_CODE_BITS (8 * sizeof(SRE_CODE)) + #include <ctype.h> /* name of this module, minus the leading underscore / @@ -58,9 +60,6 @@ static char copyright[] = / defining this one enables tracing / #undef VERBOSE -/ defining this enables unicode support (default under 1.6a1 and later) / -#define HAVE_UNICODE - / -------------------------------------------------------------------- / / optional features / @@ -146,9 +145,6 @@ static unsigned int sre_lower(unsigned i / locale-specific character predicates / / !(c & ~N) == (c < N+1) for any unsigned c, this avoids

#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0) #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '') @@ -252,55 +248,39 @@ data_stack_grow(SRE_STATE* state, Py_ssi /* generate 8-bit version / -#define SRE_CHAR unsigned char -#define SRE_CHARGET(state, buf, index) ((unsigned char)buf)[index] -#define SRE_AT sre_at -#define SRE_COUNT sre_count -#define SRE_CHARSET sre_charset -#define SRE_INFO sre_info -#define SRE_MATCH sre_match -#define SRE_MATCH_CONTEXT sre_match_context -#define SRE_SEARCH sre_search - +#define SRE_CHAR Py_UCS1 +#define SIZEOF_SRE_CHAR 1 +#define SRE(F) sre_ucs1##F +#define SRE_RECURSIVE +#include "sre.c" + +/* generate 16-bit unicode version */ + +#define SRE_CHAR Py_UCS2 +#define SIZEOF_SRE_CHAR 2 +#define SRE(F) sre_ucs2##F #define SRE_RECURSIVE #include "_sre.c" -#undef SRE_RECURSIVE - -#undef SRE_SEARCH -#undef SRE_MATCH -#undef SRE_MATCH_CONTEXT -#undef SRE_INFO -#undef SRE_CHARSET -#undef SRE_COUNT -#undef SRE_AT -#undef SRE_CHAR -#undef SRE_CHARGET - -/* generate 8/16/32-bit unicode version */ - -#define SRE_CHAR void -#define SRE_CHARGET(state, buf, index) [](#l3.74)

-#define SRE_AT sre_uat -#define SRE_COUNT sre_ucount -#define SRE_CHARSET sre_ucharset -#define SRE_INFO sre_uinfo -#define SRE_MATCH sre_umatch -#define SRE_MATCH_CONTEXT sre_umatch_context -#define SRE_SEARCH sre_usearch + +/* generate 32-bit unicode version / + +#define SRE_CHAR Py_UCS4 +#define SIZEOF_SRE_CHAR 4 +#define SRE(F) sre_ucs4_##F +#define SRE_RECURSIVE +#include "_sre.c" #endif / SRE_RECURSIVE / +#ifdef SRE_RECURSIVE / -------------------------------------------------------------------- / / String matching engine / -/ the following section is compiled twice, with different character +/* the following section is compiled three times, with different character settings / LOCAL(int) -SRE_AT(SRE_STATE state, char* ptr, SRE_CODE at) +SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) { /* check if pointer is at given position / @@ -314,16 +294,16 @@ SRE_AT(SRE_STATE state, char* ptr, SRE_ case SRE_AT_BEGINNING_LINE: return ((void*) ptr == state->beginning ||

case SRE_AT_END:

case SRE_AT_END_LINE: return ((void*) ptr == state->end ||

case SRE_AT_END_STRING: return ((void*) ptr == state->end); @@ -332,54 +312,54 @@ SRE_AT(SRE_STATE* state, char* ptr, SRE_ if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?

case SRE_AT_NON_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?

case SRE_AT_LOC_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?

case SRE_AT_LOC_NON_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?

case SRE_AT_UNI_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?

case SRE_AT_UNI_NON_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?

} @@ -388,7 +368,7 @@ SRE_AT(SRE_STATE* state, char* ptr, SRE_ } LOCAL(int) -SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) +SRE(charset)(SRE_CODE* set, SRE_CODE ch) { /* check if character is a member of the given set / @@ -411,22 +391,15 @@ SRE_CHARSET(SRE_CODE set, SRE_CODE ch) /* */ if (sre_category(set[0], (int) ch)) return ok;

case SRE_OP_CHARSET:

case SRE_OP_RANGE: @@ -446,26 +419,16 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) Py_ssize_t count, block; count = *(set++);

@@ -477,35 +440,35 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) } } -LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern); +LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern); LOCAL(Py_ssize_t) -SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) +SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) { SRE_CODE chr;

switch (pattern[0]) { case SRE_OP_IN: /* repeated set */ TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));

case SRE_OP_ANY: /* repeated dot wildcard. */ TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));

case SRE_OP_ANY_ALL: @@ -519,75 +482,87 @@ SRE_COUNT(SRE_STATE* state, SRE_CODE* pa /* repeated literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));

+#if SIZEOF_SRE_CHAR < 4

+#endif

case SRE_OP_LITERAL_IGNORE: /* repeated literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));

case SRE_OP_NOT_LITERAL: /* repeated non-literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));

+#if SIZEOF_SRE_CHAR < 4

+#endif

case SRE_OP_NOT_LITERAL_IGNORE: /* repeated non-literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));

default: /* repeated single character pattern */ TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));

} #if 0 /* not used in this release / LOCAL(int) -SRE_INFO(SRE_STATE state, SRE_CODE* pattern) +SRE(info)(SRE_STATE* state, SRE_CODE* pattern) { /* check if an SRE_OP_INFO block matches at the current position. returns the number of SRE_CODE objects to skip if successful, 0 if no match */

/* check known prefix / if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) { / */ for (i = 0; i < pattern[5]; i++)

#endif -/* The macros below should be used to protect recursive SRE_MATCH() +/* The macros below should be used to protect recursive SRE(match)()

#define JUMP_ASSERT_NOT 13 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) [](#l3.487)

TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));

if (ctx->pattern[0] == SRE_OP_INFO) { /* optimization info block / / <1=skip> <2=flags> <3=min> ... */

@@ -844,10 +818,10 @@ entrance: /* */ TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));

case SRE_OP_NOT_LITERAL: @@ -855,10 +829,10 @@ entrance: /* */ TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));

case SRE_OP_SUCCESS: @@ -871,7 +845,7 @@ entrance: /* match at given position / / */ TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));

@@ -881,19 +855,19 @@ entrance: /* */ TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));

case SRE_OP_ANY: /* match anything (except a newline) / / */ TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));

case SRE_OP_ANY_ALL: @@ -902,47 +876,47 @@ entrance: TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr)); if (ctx->ptr >= end) RETURN_FAILURE;

case SRE_OP_IN: /* match set member (or non_member) / / */ TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));

case SRE_OP_LITERAL_IGNORE: TRACE(("|%p|%p|LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, ctx->pattern[0])); if (ctx->ptr >= end ||

case SRE_OP_NOT_LITERAL_IGNORE: TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); if (ctx->ptr >= end ||

case SRE_OP_IN_IGNORE: TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); if (ctx->ptr >= end

case SRE_OP_JUMP: @@ -965,11 +939,11 @@ entrance: for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { if (ctx->pattern[1] == SRE_OP_LITERAL && (ctx->ptr >= end ||

@@ -1000,16 +974,16 @@ entrance: TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2]));

state->ptr = ctx->ptr;

/* when we arrive here, count contains the number of matches, and ctx->ptr points to the tail of the target @@ -1033,9 +1007,8 @@ entrance: ctx->u.chr = ctx->pattern[ctx->pattern[0]+1]; for (;;) { while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&

@@ -1050,7 +1023,7 @@ entrance: LASTMARK_RESTORE();

@@ -1064,7 +1037,7 @@ entrance: RETURN_ON_ERROR(ret); RETURN_SUCCESS; }

@@ -1084,7 +1057,7 @@ entrance: TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2]));

state->ptr = ctx->ptr; @@ -1093,15 +1066,15 @@ entrance: ctx->count = 0; else { /* count using pattern min as the maximum */

if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { @@ -1122,13 +1095,13 @@ entrance: RETURN_SUCCESS; } state->ptr = ctx->ptr;

@@ -1305,16 +1278,15 @@ entrance: if (groupref >= state->lastmark) { RETURN_FAILURE; } else {

@@ -1331,17 +1303,16 @@ entrance: if (groupref >= state->lastmark) { RETURN_FAILURE; } else {

@@ -1375,7 +1346,7 @@ entrance: /* */ TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1]));

@@ -1388,7 +1359,7 @@ entrance: /* */ TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1]));

@@ -1417,7 +1388,7 @@ exit: DATA_POP_DISCARD(ctx); if (ctx_pos == -1) return ret;

switch (jump) { case JUMP_MAX_UNTIL_2: @@ -1469,10 +1440,10 @@ exit: } LOCAL(Py_ssize_t) -SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) +SRE(search)(SRE_STATE* state, SRE_CODE* pattern) {

if (flags & SRE_INFO_PREFIX) { @@ -1519,32 +1490,47 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* p /* pattern starts with a known prefix. use the overlap table to skip forward as fast as we possibly can */ Py_ssize_t i = 0;

+

+#if SIZEOF_SRE_CHAR < 4

+#endif while (ptr < end) {

+

+#if SIZEOF_SRE_CHAR < 4

+#endif

@@ -1600,7 +1588,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* p return status; } -#if !defined(SRE_RECURSIVE) +#endif /* SRE_RECURSIVE / + +#ifndef SRE_RECURSIVE / -------------------------------------------------------------------- / / factories and destructors / @@ -1609,23 +1599,6 @@ SRE_SEARCH(SRE_STATE state, SRE_CODE* p static PyObjectpattern_new_match(PatternObject, SRE_STATE*, int); static PyObjectpattern_scanner(PatternObject, PyObject*, PyObject* kw); -static int -sre_literal_template(int charsize, char* ptr, Py_ssize_t len) -{

-} - static PyObject sre_codesize(PyObject self, PyObject unused) { @@ -1661,72 +1634,41 @@ state_reset(SRE_STATE state) static void* getstring(PyObject* string, Py_ssize_t* p_length,

{ /* given a python object, return a data pointer, a length (in characters), and a character size. return NULL if the object is not a string (or not compatible) */

- /* Unicode objects do not support the buffer API. So, get the data directly instead. */ if (PyUnicode_Check(string)) { if (PyUnicode_READY(string) == -1) return NULL;

-

-

-

+

-

} LOCAL(PyObject*) @@ -1736,7 +1678,7 @@ state_init(SRE_STATE* state, PatternObje /* prepare state object */ Py_ssize_t length;

@@ -1771,7 +1713,7 @@ state_init(SRE_STATE* state, PatternObje else if (end > length) end = length;

{

@@ -1849,7 +1791,7 @@ state_getslice(SRE_STATE* state, Py_ssiz j = STATE_OFFSET(state, state->mark[index+1]); }

} static void @@ -1882,14 +1824,34 @@ pattern_dealloc(PatternObject* self) { if (self->weakreflist != NULL) PyObject_ClearWeakRefs((PyObject *) self);

+} + +LOCAL(Py_ssize_t) +sre_search(SRE_STATE* state, SRE_CODE* pattern) +{

+} + static PyObject* pattern_match(PatternObject* self, PyObject* args, PyObject* kw) { @@ -1912,11 +1874,7 @@ pattern_match(PatternObject* self, PyObj TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));

TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); if (PyErr_Occurred()) @@ -1947,11 +1905,7 @@ pattern_search(PatternObject* self, PyOb TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));

TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); @@ -2044,12 +1998,7 @@ pattern_findall(PatternObject* self, PyO state.ptr = state.start;

-

@@ -2065,7 +2014,7 @@ pattern_findall(PatternObject* self, PyO case 0: b = STATE_OFFSET(&state, state.start); e = STATE_OFFSET(&state, state.ptr);

@@ -2171,12 +2120,7 @@ pattern_split(PatternObject* self, PyObj state.ptr = state.start;

-

@@ -2196,7 +2140,7 @@ pattern_split(PatternObject* self, PyObj } /* get segment before this match */

@@ -2225,7 +2169,7 @@ pattern_split(PatternObject* self, PyObj } /* get segment following last match (even if empty) */

@@ -2320,12 +2267,7 @@ pattern_subx(PatternObject* self, PyObje state.ptr = state.start;

-

@@ -2341,7 +2283,7 @@ pattern_subx(PatternObject* self, PyObje if (i < b) { /* get segment before this match */

@@ -2397,7 +2339,7 @@ next: /* get segment following last match */ if (i < state.endpos) {

@@ -2412,7 +2354,7 @@ next: Py_DECREF(filter); /* convert list to single string (also removes list) */

@@ -2422,7 +2364,7 @@ next: item = joiner; } else {

@@ -2652,7 +2594,6 @@ static PyObject * self->pattern = NULL; self->groupindex = NULL; self->indexgroup = NULL;

self->codesize = n; @@ -2673,16 +2614,20 @@ static PyObject * } if (pattern == Py_None) {

case SRE_OP_CHARSET:

@@ -2818,7 +2763,7 @@ static int FAIL; } code += offset;

@@ -3188,7 +3133,7 @@ static PyObject* match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def) { Py_ssize_t length;

@@ -3790,11 +3735,7 @@ scanner_match(ScannerObject* self, PyObj state->ptr = state->start;

@@ -3821,11 +3762,7 @@ scanner_search(ScannerObject* self, PyOb state->ptr = state->start;

@@ -3980,5 +3917,12 @@ PyMODINIT_FUNC PyInit__sre(void) #endif /* !defined(SRE_RECURSIVE) / +#ifdef SRE_RECURSIVE +# undef SRE_RECURSIVE +# undef SRE_CHAR +# undef SIZEOF_SRE_CHAR +# undef SRE +#endif / SRE_RECURSIVE / + / vim:ts=4:sw=4:et */

--- a/Modules/sre.h +++ b/Modules/sre.h @@ -31,9 +31,7 @@ typedef struct { PyObject* pattern; /* pattern source (or None) / int flags; / flags used when compiling pattern source */ PyObject weakreflist; / List of weak references */