cpython: 66e2dfbb1d70 (original) (raw)
--- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -77,6 +77,8 @@ class ReTests(unittest.TestCase): self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz') self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz') self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
for y in ("\xe0", "\u0430", "\U0001d49c"):[](#l1.7)
self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')[](#l1.8)
self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), @@ -250,6 +252,13 @@ class ReTests(unittest.TestCase): [b'', b'a', b'b', b'c']) self.assertTypedEqual(re.split(b"(:*)", string), [b'', b':', b'a', b':', b'b', b'::', b'c'])
for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",[](#l1.16)
"\U0001d49c\U0001d49e\U0001d4b5"):[](#l1.17)
string = ":%s:%s::%s" % (a, b, c)[](#l1.18)
self.assertEqual(re.split(":", string), ['', a, b, '', c])[](#l1.19)
self.assertEqual(re.split(":*", string), ['', a, b, c])[](#l1.20)
self.assertEqual(re.split("(:*)", string),[](#l1.21)
['', ':', a, ':', b, '::', c])[](#l1.22)
self.assertEqual(re.split("(?::)", ":a:b::c"), ['', 'a', 'b', 'c']) self.assertEqual(re.split("(:)", ":a:b::c"), @@ -287,6 +296,14 @@ class ReTests(unittest.TestCase): [b":", b"::", b":::"]) self.assertTypedEqual(re.findall(b"(:)(:*)", string), [(b":", b""), (b":", b":"), (b":", b"::")])
for x in ("\xe0", "\u0430", "\U0001d49c"):[](#l1.30)
xx = x * 2[](#l1.31)
xxx = x * 3[](#l1.32)
string = "a%sb%sc%sd" % (x, xx, xxx)[](#l1.33)
self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])[](#l1.34)
self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])[](#l1.35)
self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),[](#l1.36)
[(x, ""), (x, x), (x, xx)])[](#l1.37)
def test_bug_117612(self): self.assertEqual(re.findall(r"(a|(b))", "aba"), @@ -305,6 +322,12 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match(b'(a)', string).group(0), b'a') self.assertEqual(re.match(b'(a)', string).group(1), b'a') self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
for a in ("\xe0", "\u0430", "\U0001d49c"):[](#l1.45)
self.assertEqual(re.match(a, a).groups(), ())[](#l1.46)
self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))[](#l1.47)
self.assertEqual(re.match('(%s)' % a, a).group(0), a)[](#l1.48)
self.assertEqual(re.match('(%s)' % a, a).group(1), a)[](#l1.49)
self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))[](#l1.50)
pat = re.compile('((a)|(b))(c)?') self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
--- a/Misc/NEWS +++ b/Misc/NEWS @@ -21,6 +21,8 @@ Core and Builtins Library ------- +- Issue #18685: Restore re performance to pre-PEP 393 levels. +
--- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -46,6 +46,8 @@ static char copyright[] = #include "sre.h" +#define SRE_CODE_BITS (8 * sizeof(SRE_CODE)) + #include <ctype.h> /* name of this module, minus the leading underscore / @@ -58,9 +60,6 @@ static char copyright[] = / defining this one enables tracing / #undef VERBOSE -/ defining this enables unicode support (default under 1.6a1 and later) / -#define HAVE_UNICODE - / -------------------------------------------------------------------- / / optional features / @@ -146,9 +145,6 @@ static unsigned int sre_lower(unsigned i / locale-specific character predicates / / !(c & ~N) == (c < N+1) for any unsigned c, this avoids
- warnings when c's type supports only numbers < N+1 */ -#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0) -#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0) -#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0) #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '') @@ -252,55 +248,39 @@ data_stack_grow(SRE_STATE* state, Py_ssi /* generate 8-bit version / -#define SRE_CHAR unsigned char -#define SRE_CHARGET(state, buf, index) ((unsigned char)buf)[index] -#define SRE_AT sre_at -#define SRE_COUNT sre_count -#define SRE_CHARSET sre_charset -#define SRE_INFO sre_info -#define SRE_MATCH sre_match -#define SRE_MATCH_CONTEXT sre_match_context -#define SRE_SEARCH sre_search - +#define SRE_CHAR Py_UCS1 +#define SIZEOF_SRE_CHAR 1 +#define SRE(F) sre_ucs1##F +#define SRE_RECURSIVE +#include "sre.c" + +/* generate 16-bit unicode version */ + +#define SRE_CHAR Py_UCS2 +#define SIZEOF_SRE_CHAR 2 +#define SRE(F) sre_ucs2##F #define SRE_RECURSIVE #include "_sre.c" -#undef SRE_RECURSIVE - -#undef SRE_SEARCH -#undef SRE_MATCH -#undef SRE_MATCH_CONTEXT -#undef SRE_INFO -#undef SRE_CHARSET -#undef SRE_COUNT -#undef SRE_AT -#undef SRE_CHAR -#undef SRE_CHARGET - -/* generate 8/16/32-bit unicode version */ - -#define SRE_CHAR void -#define SRE_CHARGET(state, buf, index) [](#l3.74)
- ((state->charsize==1) ? ((Py_UCS1*)buf)[index] : [](#l3.75)
(state->charsize==2) ? ((Py_UCS2*)buf)[index] : \[](#l3.76)
((Py_UCS4*)buf)[index])[](#l3.77)
-#define SRE_AT sre_uat -#define SRE_COUNT sre_ucount -#define SRE_CHARSET sre_ucharset -#define SRE_INFO sre_uinfo -#define SRE_MATCH sre_umatch -#define SRE_MATCH_CONTEXT sre_umatch_context -#define SRE_SEARCH sre_usearch + +/* generate 32-bit unicode version / + +#define SRE_CHAR Py_UCS4 +#define SIZEOF_SRE_CHAR 4 +#define SRE(F) sre_ucs4_##F +#define SRE_RECURSIVE +#include "_sre.c" #endif / SRE_RECURSIVE / +#ifdef SRE_RECURSIVE / -------------------------------------------------------------------- / / String matching engine / -/ the following section is compiled twice, with different character +/* the following section is compiled three times, with different character settings / LOCAL(int) -SRE_AT(SRE_STATE state, char* ptr, SRE_CODE at) +SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) { /* check if pointer is at given position / @@ -314,16 +294,16 @@ SRE_AT(SRE_STATE state, char* ptr, SRE_ case SRE_AT_BEGINNING_LINE: return ((void*) ptr == state->beginning ||
SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, -1)));[](#l3.114)
SRE_IS_LINEBREAK((int) ptr[-1]));[](#l3.115)
return (((void*) (ptr+state->charsize) == state->end &&[](#l3.118)
SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0))) ||[](#l3.119)
return (((void*) (ptr+1) == state->end &&[](#l3.120)
SRE_IS_LINEBREAK((int) ptr[0])) ||[](#l3.121) ((void*) ptr == state->end));[](#l3.122)
case SRE_AT_END_LINE: return ((void*) ptr == state->end ||
SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0)));[](#l3.126)
SRE_IS_LINEBREAK((int) ptr[0]));[](#l3.127)
case SRE_AT_END_STRING: return ((void*) ptr == state->end); @@ -332,54 +312,54 @@ SRE_AT(SRE_STATE* state, char* ptr, SRE_ if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?
SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;[](#l3.135)
SRE_IS_WORD((int) ptr[-1]) : 0;[](#l3.136) thisp = ((void*) ptr < state->end) ?[](#l3.137)
SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;[](#l3.138)
SRE_IS_WORD((int) ptr[0]) : 0;[](#l3.139) return thisp != thatp;[](#l3.140)
case SRE_AT_NON_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?
SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;[](#l3.146)
SRE_IS_WORD((int) ptr[-1]) : 0;[](#l3.147) thisp = ((void*) ptr < state->end) ?[](#l3.148)
SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;[](#l3.149)
SRE_IS_WORD((int) ptr[0]) : 0;[](#l3.150) return thisp == thatp;[](#l3.151)
case SRE_AT_LOC_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?
SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;[](#l3.157)
SRE_LOC_IS_WORD((int) ptr[-1]) : 0;[](#l3.158) thisp = ((void*) ptr < state->end) ?[](#l3.159)
SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;[](#l3.160)
SRE_LOC_IS_WORD((int) ptr[0]) : 0;[](#l3.161) return thisp != thatp;[](#l3.162)
case SRE_AT_LOC_NON_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?
SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;[](#l3.168)
SRE_LOC_IS_WORD((int) ptr[-1]) : 0;[](#l3.169) thisp = ((void*) ptr < state->end) ?[](#l3.170)
SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;[](#l3.171)
SRE_LOC_IS_WORD((int) ptr[0]) : 0;[](#l3.172) return thisp == thatp;[](#l3.173)
case SRE_AT_UNI_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?
SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;[](#l3.179)
SRE_UNI_IS_WORD((int) ptr[-1]) : 0;[](#l3.180) thisp = ((void*) ptr < state->end) ?[](#l3.181)
SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;[](#l3.182)
SRE_UNI_IS_WORD((int) ptr[0]) : 0;[](#l3.183) return thisp != thatp;[](#l3.184)
case SRE_AT_UNI_NON_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ?
SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;[](#l3.190)
SRE_UNI_IS_WORD((int) ptr[-1]) : 0;[](#l3.191) thisp = ((void*) ptr < state->end) ?[](#l3.192)
SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;[](#l3.193)
SRE_UNI_IS_WORD((int) ptr[0]) : 0;[](#l3.194) return thisp == thatp;[](#l3.195)
}
@@ -388,7 +368,7 @@ SRE_AT(SRE_STATE* state, char* ptr, SRE_
}
LOCAL(int)
-SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
+SRE(charset)(SRE_CODE* set, SRE_CODE ch)
{
/* check if character is a member of the given set /
@@ -411,22 +391,15 @@ SRE_CHARSET(SRE_CODE set, SRE_CODE ch)
/* */
if (sre_category(set[0], (int) ch))
return ok;
set += 1;[](#l3.211)
set++;[](#l3.212) break;[](#l3.213)
if (sizeof(SRE_CODE) == 2) {[](#l3.216)
/* <CHARSET> <bitmap> (16 bits per code word) */[](#l3.217)
if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))[](#l3.218)
return ok;[](#l3.219)
set += 16;[](#l3.220)
}[](#l3.221)
else {[](#l3.222)
/* <CHARSET> <bitmap> (32 bits per code word) */[](#l3.223)
if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))[](#l3.224)
return ok;[](#l3.225)
set += 8;[](#l3.226)
}[](#l3.227)
/* <CHARSET> <bitmap> */[](#l3.228)
if (ch < 256 &&[](#l3.229)
(set[ch/SRE_CODE_BITS] & (1u << (ch & (SRE_CODE_BITS-1)))))[](#l3.230)
return ok;[](#l3.231)
set += 256/SRE_CODE_BITS;[](#l3.232) break;[](#l3.233)
case SRE_OP_RANGE: @@ -446,26 +419,16 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) Py_ssize_t count, block; count = *(set++);
if (sizeof(SRE_CODE) == 2) {[](#l3.240)
if (ch < 0x10000u)[](#l3.241) block = ((unsigned char*)set)[ch >> 8];[](#l3.242)
set += 128;[](#l3.243)
if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))[](#l3.244)
return ok;[](#l3.245)
set += count*16;[](#l3.246)
}[](#l3.247)
else {[](#l3.248)
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids[](#l3.249)
* warnings when c's type supports only numbers < N+1 */[](#l3.250)
if (!(ch & ~65535))[](#l3.251)
block = ((unsigned char*)set)[ch >> 8];[](#l3.252)
else[](#l3.253)
block = -1;[](#l3.254)
set += 64;[](#l3.255)
if (block >=0 &&[](#l3.256)
(set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))[](#l3.257)
return ok;[](#l3.258)
set += count*8;[](#l3.259)
}[](#l3.260)
else[](#l3.261)
block = -1;[](#l3.262)
set += 256/sizeof(SRE_CODE);[](#l3.263)
if (block >=0 &&[](#l3.264)
(set[(block * 256 + (ch & 255))/SRE_CODE_BITS] &[](#l3.265)
(1u << (ch & (SRE_CODE_BITS-1)))))[](#l3.266)
return ok;[](#l3.267)
set += count * (256/SRE_CODE_BITS);[](#l3.268) break;[](#l3.269) }[](#l3.270)
@@ -477,35 +440,35 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) } } -LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern); +LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern); LOCAL(Py_ssize_t) -SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) +SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) { SRE_CODE chr;
- SRE_CHAR c;
- SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
- SRE_CHAR* end = (SRE_CHAR )state->end; Py_ssize_t i; / adjust end */
- if (maxcount < (end - ptr) / state->charsize && maxcount != SRE_MAXREPEAT)
end = ptr + maxcount*state->charsize;[](#l3.293)
switch (pattern[0]) { case SRE_OP_IN: /* repeated set */ TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
while (ptr < end &&[](#l3.302)
SRE_CHARSET(pattern + 2, SRE_CHARGET(state, ptr, 0)))[](#l3.303)
ptr += state->charsize;[](#l3.304)
while (ptr < end && SRE(charset)(pattern + 2, *ptr))[](#l3.305)
ptr++;[](#l3.306) break;[](#l3.307)
case SRE_OP_ANY: /* repeated dot wildcard. */ TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
while (ptr < end && !SRE_IS_LINEBREAK(SRE_CHARGET(state, ptr, 0)))[](#l3.312)
ptr += state->charsize;[](#l3.313)
while (ptr < end && !SRE_IS_LINEBREAK(*ptr))[](#l3.314)
ptr++;[](#l3.315) break;[](#l3.316)
case SRE_OP_ANY_ALL: @@ -519,75 +482,87 @@ SRE_COUNT(SRE_STATE* state, SRE_CODE* pa /* repeated literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) == chr)[](#l3.323)
ptr += state->charsize;[](#l3.324)
c = (SRE_CHAR) chr;[](#l3.325)
if ((SRE_CODE) c != chr)[](#l3.327)
; /* literal can't match: doesn't fit in char width */[](#l3.328)
else[](#l3.329)
while (ptr < end && *ptr == c)[](#l3.331)
ptr++;[](#l3.332) break;[](#l3.333)
case SRE_OP_LITERAL_IGNORE: /* repeated literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) == chr)[](#l3.339)
ptr += state->charsize;[](#l3.340)
while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)[](#l3.341)
ptr++;[](#l3.342) break;[](#l3.343)
case SRE_OP_NOT_LITERAL: /* repeated non-literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)[](#l3.349)
ptr += state->charsize;[](#l3.350)
c = (SRE_CHAR) chr;[](#l3.351)
if ((SRE_CODE) c != chr)[](#l3.353)
ptr = end; /* literal can't match: doesn't fit in char width */[](#l3.354)
else[](#l3.355)
while (ptr < end && *ptr != c)[](#l3.357)
ptr++;[](#l3.358) break;[](#l3.359)
case SRE_OP_NOT_LITERAL_IGNORE: /* repeated non-literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) != chr)[](#l3.365)
ptr += state->charsize;[](#l3.366)
while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)[](#l3.367)
ptr++;[](#l3.368) break;[](#l3.369)
default: /* repeated single character pattern */ TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
while ((char*) state->ptr < end) {[](#l3.374)
i = SRE_MATCH(state, pattern);[](#l3.375)
while ((SRE_CHAR*) state->ptr < end) {[](#l3.376)
i = SRE(match)(state, pattern);[](#l3.377) if (i < 0)[](#l3.378) return i;[](#l3.379) if (!i)[](#l3.380) break;[](#l3.381) }[](#l3.382) TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,[](#l3.383)
((char*)state->ptr - ptr)/state->charsize));[](#l3.384)
return ((char*)state->ptr - ptr)/state->charsize;[](#l3.385)
(SRE_CHAR*) state->ptr - ptr));[](#l3.386)
} TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,return (SRE_CHAR*) state->ptr - ptr;[](#l3.387)
(ptr - (char*) state->ptr)/state->charsize));[](#l3.391)
- return (ptr - (char*) state->ptr)/state->charsize;
} #if 0 /* not used in this release / LOCAL(int) -SRE_INFO(SRE_STATE state, SRE_CODE* pattern) +SRE(info)(SRE_STATE* state, SRE_CODE* pattern) { /* check if an SRE_OP_INFO block matches at the current position. returns the number of SRE_CODE objects to skip if successful, 0 if no match */
- SRE_CHAR* end = (SRE_CHAR*) state->end;
- SRE_CHAR* ptr = (SRE_CHAR*) state->ptr; Py_ssize_t i; /* check minimal length */
/* check known prefix / if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) { / */ for (i = 0; i < pattern[5]; i++)
if ((SRE_CODE) SRE_CHARGET(state, ptr, i) != pattern[7 + i])[](#l3.421)
} @@ -595,30 +570,30 @@ SRE_INFO(SRE_STATE* state, SRE_CODE* pat }if ((SRE_CODE) ptr[i] != pattern[7 + i])[](#l3.422) return 0;[](#l3.423) return pattern[0] + 2 * pattern[6];[](#l3.424)
#endif -/* The macros below should be used to protect recursive SRE_MATCH() +/* The macros below should be used to protect recursive SRE(match)()
- loop iteration, since the recursive SRE(match)() could do anything,
- and could potentially depend on lastmark. *
- For more information, check the discussion at SF patch #712900. @@ -657,7 +632,7 @@ do { [](#l3.466) int j = data_stack_grow(state, sizeof(type)); [](#l3.467) if (j < 0) return j; [](#l3.468) if (ctx_pos != -1) [](#l3.469)
DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \[](#l3.470)
} [](#l3.472) ptr = (type*)(state->data_stack+alloc_pos); [](#l3.473) state->data_stack_base += sizeof(type); [](#l3.474) @@ -678,7 +653,7 @@ do { [](#l3.475) int j = data_stack_grow(state, size); [](#l3.476) if (j < 0) return j; [](#l3.477) if (ctx_pos != -1) [](#l3.478)DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \[](#l3.471)
DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \[](#l3.479)
} [](#l3.481) memcpy(state->data_stack+state->data_stack_base, data, size); [](#l3.482) state->data_stack_base += size; [](#l3.483) @@ -747,7 +722,7 @@ do { [](#l3.484)DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \[](#l3.480)
#define JUMP_ASSERT_NOT 13 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) [](#l3.487)
- DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); [](#l3.488)
- DATA_ALLOC(SRE(match_context), nextctx); [](#l3.489) nextctx->last_ctx_pos = ctx_pos; [](#l3.490) nextctx->jump = jumpvalue; [](#l3.491) nextctx->pattern = nextpattern; [](#l3.492) @@ -760,7 +735,7 @@ do { [](#l3.493) typedef struct { Py_ssize_t last_ctx_pos; Py_ssize_t jump;
- SRE_CHAR* ptr; SRE_CODE* pattern; Py_ssize_t count; Py_ssize_t lastmark; @@ -769,25 +744,25 @@ typedef struct { SRE_CODE chr; SRE_REPEAT* rep; } u; -} SRE_MATCH_CONTEXT; +} SRE(match_context); /* check if string matches the given pattern. returns <0 for error, 0 for failure, and 1 for success / LOCAL(Py_ssize_t) -SRE_MATCH(SRE_STATE state, SRE_CODE* pattern) +SRE(match)(SRE_STATE* state, SRE_CODE* pattern) {
- SRE_CHAR* end = (SRE_CHAR *)state->end; Py_ssize_t alloc_pos, ctx_pos = -1; Py_ssize_t i, ret = 0; Py_ssize_t jump; unsigned int sigcount=0;
TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
- DATA_ALLOC(SRE(match_context), ctx); ctx->last_ctx_pos = -1; ctx->jump = JUMP_NONE; ctx->pattern = pattern; @@ -795,16 +770,15 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pa entrance:
if (ctx->pattern[0] == SRE_OP_INFO) { /* optimization info block / / <1=skip> <2=flags> <3=min> ... */
if (ctx->pattern[3] && (Py_uintptr_t)(end - ctx->ptr)/state->charsize < ctx->pattern[3]) {[](#l3.544)
if (ctx->pattern[3] && (Py_uintptr_t)(end - ctx->ptr) < ctx->pattern[3]) {[](#l3.545) TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "[](#l3.546) "need %" PY_FORMAT_SIZE_T "d)\n",[](#l3.547)
(end - ctx->ptr)/state->charsize,[](#l3.548)
(Py_ssize_t) ctx->pattern[3]));[](#l3.549)
end - ctx->ptr, (Py_ssize_t) ctx->pattern[3]));[](#l3.550) RETURN_FAILURE;[](#l3.551) }[](#l3.552) ctx->pattern += ctx->pattern[1] + 1;[](#l3.553)
@@ -844,10 +818,10 @@ entrance:
/* */
TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[0])[](#l3.558)
if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])[](#l3.559) RETURN_FAILURE;[](#l3.560) ctx->pattern++;[](#l3.561)
ctx->ptr += state->charsize;[](#l3.562)
ctx->ptr++;[](#l3.563) break;[](#l3.564)
case SRE_OP_NOT_LITERAL:
@@ -855,10 +829,10 @@ entrance:
/* */
TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) == ctx->pattern[0])[](#l3.571)
if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])[](#l3.572) RETURN_FAILURE;[](#l3.573) ctx->pattern++;[](#l3.574)
ctx->ptr += state->charsize;[](#l3.575)
ctx->ptr++;[](#l3.576) break;[](#l3.577)
case SRE_OP_SUCCESS:
@@ -871,7 +845,7 @@ entrance:
/* match at given position /
/ */
TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
if (!SRE_AT(state, ctx->ptr, *ctx->pattern))[](#l3.584)
if (!SRE(at)(state, ctx->ptr, *ctx->pattern))[](#l3.585) RETURN_FAILURE;[](#l3.586) ctx->pattern++;[](#l3.587) break;[](#l3.588)
@@ -881,19 +855,19 @@ entrance:
/* */
TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end || !sre_category(ctx->pattern[0], SRE_CHARGET(state, ctx->ptr, 0)))[](#l3.593)
if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))[](#l3.594) RETURN_FAILURE;[](#l3.595) ctx->pattern++;[](#l3.596)
ctx->ptr += state->charsize;[](#l3.597)
ctx->ptr++;[](#l3.598) break;[](#l3.599)
case SRE_OP_ANY: /* match anything (except a newline) / / */ TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
if (ctx->ptr >= end || SRE_IS_LINEBREAK(SRE_CHARGET(state, ctx->ptr, 0)))[](#l3.605)
RETURN_FAILURE;[](#l3.606)
ctx->ptr += state->charsize;[](#l3.607)
if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))[](#l3.608)
RETURN_FAILURE;[](#l3.609)
ctx->ptr++;[](#l3.610) break;[](#l3.611)
case SRE_OP_ANY_ALL: @@ -902,47 +876,47 @@ entrance: TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr)); if (ctx->ptr >= end) RETURN_FAILURE;
ctx->ptr += state->charsize;[](#l3.618)
ctx->ptr++;[](#l3.619) break;[](#l3.620)
case SRE_OP_IN: /* match set member (or non_member) / / */ TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, SRE_CHARGET(state, ctx->ptr, 0)))[](#l3.626)
RETURN_FAILURE;[](#l3.627)
if (ctx->ptr >= end || !SRE(charset)(ctx->pattern + 1, *ctx->ptr))[](#l3.628)
RETURN_FAILURE;[](#l3.629) ctx->pattern += ctx->pattern[0];[](#l3.630)
ctx->ptr += state->charsize;[](#l3.631)
ctx->ptr++;[](#l3.632) break;[](#l3.633)
case SRE_OP_LITERAL_IGNORE: TRACE(("|%p|%p|LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, ctx->pattern[0])); if (ctx->ptr >= end ||
state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*ctx->pattern))[](#l3.639)
state->lower(*ctx->ptr) != state->lower(*ctx->pattern))[](#l3.640) RETURN_FAILURE;[](#l3.641) ctx->pattern++;[](#l3.642)
ctx->ptr += state->charsize;[](#l3.643)
ctx->ptr++;[](#l3.644) break;[](#l3.645)
case SRE_OP_NOT_LITERAL_IGNORE: TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); if (ctx->ptr >= end ||
state->lower(SRE_CHARGET(state, ctx->ptr, 0)) == state->lower(*ctx->pattern))[](#l3.651)
state->lower(*ctx->ptr) == state->lower(*ctx->pattern))[](#l3.652) RETURN_FAILURE;[](#l3.653) ctx->pattern++;[](#l3.654)
ctx->ptr += state->charsize;[](#l3.655)
ctx->ptr++;[](#l3.656) break;[](#l3.657)
case SRE_OP_IN_IGNORE: TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); if (ctx->ptr >= end
|| !SRE_CHARSET(ctx->pattern+1,[](#l3.662)
(SRE_CODE)state->lower(SRE_CHARGET(state, ctx->ptr, 0))))[](#l3.663)
|| !SRE(charset)(ctx->pattern+1,[](#l3.664)
(SRE_CODE)state->lower(*ctx->ptr)))[](#l3.665) RETURN_FAILURE;[](#l3.666) ctx->pattern += ctx->pattern[0];[](#l3.667)
ctx->ptr += state->charsize;[](#l3.668)
ctx->ptr++;[](#l3.669) break;[](#l3.670)
case SRE_OP_JUMP: @@ -965,11 +939,11 @@ entrance: for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { if (ctx->pattern[1] == SRE_OP_LITERAL && (ctx->ptr >= end ||
(SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[2]))[](#l3.677)
(SRE_CODE) *ctx->ptr != ctx->pattern[2]))[](#l3.678) continue;[](#l3.679) if (ctx->pattern[1] == SRE_OP_IN &&[](#l3.680) (ctx->ptr >= end ||[](#l3.681)
!SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0))))[](#l3.682)
!SRE(charset)(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))[](#l3.683) continue;[](#l3.684) state->ptr = ctx->ptr;[](#l3.685) DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);[](#l3.686)
@@ -1000,16 +974,16 @@ entrance: TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2]));
if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)[](#l3.691)
if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)[](#l3.692) RETURN_FAILURE; /* cannot match */[](#l3.693)
ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);[](#l3.697)
ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[2]);[](#l3.698) RETURN_ON_ERROR(ret);[](#l3.699)
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);[](#l3.700)
DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);[](#l3.701) ctx->count = ret;[](#l3.702)
ctx->ptr += state->charsize * ctx->count;[](#l3.703)
ctx->ptr += ctx->count;[](#l3.704)
/* when we arrive here, count contains the number of matches, and ctx->ptr points to the tail of the target @@ -1033,9 +1007,8 @@ entrance: ctx->u.chr = ctx->pattern[ctx->pattern[0]+1]; for (;;) { while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
(ctx->ptr >= end ||[](#l3.712)
SRE_CHARGET(state, ctx->ptr, 0) != ctx->u.chr)) {[](#l3.713)
ctx->ptr -= state->charsize;[](#l3.714)
(ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {[](#l3.715)
ctx->ptr--;[](#l3.716) ctx->count--;[](#l3.717) }[](#l3.718) if (ctx->count < (Py_ssize_t) ctx->pattern[1])[](#l3.719)
@@ -1050,7 +1023,7 @@ entrance: LASTMARK_RESTORE();
ctx->ptr -= state->charsize;[](#l3.724)
ctx->ptr--;[](#l3.725) ctx->count--;[](#l3.726) }[](#l3.727)
@@ -1064,7 +1037,7 @@ entrance: RETURN_ON_ERROR(ret); RETURN_SUCCESS; }
ctx->ptr -= state->charsize;[](#l3.733)
ctx->ptr--;[](#l3.734) ctx->count--;[](#l3.735) LASTMARK_RESTORE();[](#l3.736) }[](#l3.737)
@@ -1084,7 +1057,7 @@ entrance: TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2]));
if ((Py_ssize_t) ctx->pattern[1] > (end - ctx->ptr) / state->charsize)[](#l3.742)
if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)[](#l3.743) RETURN_FAILURE; /* cannot match */[](#l3.744)
state->ptr = ctx->ptr; @@ -1093,15 +1066,15 @@ entrance: ctx->count = 0; else { /* count using pattern min as the maximum */
ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);[](#l3.751)
ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[1]);[](#l3.752) RETURN_ON_ERROR(ret);[](#l3.753)
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);[](#l3.754)
DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);[](#l3.755) if (ret < (Py_ssize_t) ctx->pattern[1])[](#l3.756) /* didn't match minimum number of times */[](#l3.757) RETURN_FAILURE;[](#l3.758) /* advance past minimum matches of repeat */[](#l3.759) ctx->count = ret;[](#l3.760)
ctx->ptr += state->charsize * ctx->count;[](#l3.761)
ctx->ptr += ctx->count;[](#l3.762) }[](#l3.763)
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { @@ -1122,13 +1095,13 @@ entrance: RETURN_SUCCESS; } state->ptr = ctx->ptr;
ret = SRE_COUNT(state, ctx->pattern+3, 1);[](#l3.770)
ret = SRE(count)(state, ctx->pattern+3, 1);[](#l3.771) RETURN_ON_ERROR(ret);[](#l3.772)
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);[](#l3.773)
DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);[](#l3.774) if (ret == 0)[](#l3.775) break;[](#l3.776) assert(ret == 1);[](#l3.777)
ctx->ptr += state->charsize;[](#l3.778)
ctx->ptr++;[](#l3.779) ctx->count++;[](#l3.780) LASTMARK_RESTORE();[](#l3.781) }[](#l3.782)
@@ -1305,16 +1278,15 @@ entrance: if (groupref >= state->lastmark) { RETURN_FAILURE; } else {
char* p = (char*) state->mark[groupref];[](#l3.787)
char* e = (char*) state->mark[groupref+1];[](#l3.788)
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];[](#l3.789)
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];[](#l3.790) if (!p || !e || e < p)[](#l3.791) RETURN_FAILURE;[](#l3.792) while (p < e) {[](#l3.793)
if (ctx->ptr >= end ||[](#l3.794)
SRE_CHARGET(state, ctx->ptr, 0) != SRE_CHARGET(state, p, 0))[](#l3.795)
if (ctx->ptr >= end || *ctx->ptr != *p)[](#l3.796) RETURN_FAILURE;[](#l3.797)
p += state->charsize;[](#l3.798)
ctx->ptr += state->charsize;[](#l3.799)
p++;[](#l3.800)
ctx->ptr++;[](#l3.801) }[](#l3.802) }[](#l3.803) }[](#l3.804)
@@ -1331,17 +1303,16 @@ entrance: if (groupref >= state->lastmark) { RETURN_FAILURE; } else {
char* p = (char*) state->mark[groupref];[](#l3.809)
char* e = (char*) state->mark[groupref+1];[](#l3.810)
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];[](#l3.811)
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];[](#l3.812) if (!p || !e || e < p)[](#l3.813) RETURN_FAILURE;[](#l3.814) while (p < e) {[](#l3.815) if (ctx->ptr >= end ||[](#l3.816)
state->lower(SRE_CHARGET(state, ctx->ptr, 0)) !=[](#l3.817)
state->lower(SRE_CHARGET(state, p, 0)))[](#l3.818)
state->lower(*ctx->ptr) != state->lower(*p))[](#l3.819) RETURN_FAILURE;[](#l3.820)
p += state->charsize;[](#l3.821)
ctx->ptr += state->charsize;[](#l3.822)
p++;[](#l3.823)
ctx->ptr++;[](#l3.824) }[](#l3.825) }[](#l3.826) }[](#l3.827)
@@ -1375,7 +1346,7 @@ entrance: /* */ TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1]));
state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];[](#l3.832)
state->ptr = ctx->ptr - ctx->pattern[1];[](#l3.833) if (state->ptr < state->beginning)[](#l3.834) RETURN_FAILURE;[](#l3.835) DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);[](#l3.836)
@@ -1388,7 +1359,7 @@ entrance: /* */ TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1]));
state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];[](#l3.841)
state->ptr = ctx->ptr - ctx->pattern[1];[](#l3.842) if (state->ptr >= state->beginning) {[](#l3.843) DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);[](#l3.844) if (ret) {[](#l3.845)
@@ -1417,7 +1388,7 @@ exit: DATA_POP_DISCARD(ctx); if (ctx_pos == -1) return ret;
switch (jump) { case JUMP_MAX_UNTIL_2: @@ -1469,10 +1440,10 @@ exit: } LOCAL(Py_ssize_t) -SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) +SRE(search)(SRE_STATE* state, SRE_CODE* pattern) {
- SRE_CHAR* ptr = (SRE_CHAR *)state->start;
- SRE_CHAR* end = (SRE_CHAR )state->end; Py_ssize_t status = 0; Py_ssize_t prefix_len = 0; Py_ssize_t prefix_skip = 0; @@ -1490,9 +1461,9 @@ SRE_SEARCH(SRE_STATE state, SRE_CODE* p if (pattern[3] > 1) { /* adjust end point (but make sure we leave at least one character in there, so literal search will work) */
end -= (pattern[3]-1) * state->charsize;[](#l3.873)
end -= pattern[3] - 1;[](#l3.874) if (end <= ptr)[](#l3.875)
end = ptr + state->charsize;[](#l3.876)
end = ptr;[](#l3.877) }[](#l3.878)
if (flags & SRE_INFO_PREFIX) { @@ -1519,32 +1490,47 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* p /* pattern starts with a known prefix. use the overlap table to skip forward as fast as we possibly can */ Py_ssize_t i = 0;
end = (char *)state->end;[](#l3.885)
end = (SRE_CHAR *)state->end;[](#l3.887)
if (prefix_len > end - ptr)[](#l3.888)
return 0;[](#l3.889)
for (i = 0; i < prefix_len; i++)[](#l3.891)
if ((SRE_CODE)(SRE_CHAR) prefix[i] != prefix[i])[](#l3.892)
return 0; /* literal can't match: doesn't fit in char width */[](#l3.893)
for (;;) {[](#l3.896)
if ((SRE_CODE) SRE_CHARGET(state, ptr, 0) != prefix[i]) {[](#l3.897)
if (!i)[](#l3.898)
break;[](#l3.899)
else[](#l3.900)
i = overlap[i];[](#l3.901)
} else {[](#l3.902)
if (++i == prefix_len) {[](#l3.903)
/* found a potential match */[](#l3.904)
TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));[](#l3.905)
state->start = ptr - (prefix_len - 1) * state->charsize;[](#l3.906)
state->ptr = ptr - (prefix_len - prefix_skip - 1) * state->charsize;[](#l3.907)
if (flags & SRE_INFO_LITERAL)[](#l3.908)
return 1; /* we got all of it */[](#l3.909)
status = SRE_MATCH(state, pattern + 2*prefix_skip);[](#l3.910)
if (status != 0)[](#l3.911)
return status;[](#l3.912)
/* close but no cigar -- try again */[](#l3.913)
i = overlap[i];[](#l3.914)
SRE_CHAR c = (SRE_CHAR) prefix[0];[](#l3.915)
while (*ptr++ != c) {[](#l3.916)
if (ptr >= end)[](#l3.917)
return 0;[](#l3.918)
}[](#l3.919)
if (ptr >= end)[](#l3.920)
return 0;[](#l3.921)
i = 1;[](#l3.923)
do {[](#l3.924)
if (*ptr == (SRE_CHAR) prefix[i]) {[](#l3.925)
if (++i != prefix_len) {[](#l3.926)
if (++ptr >= end)[](#l3.927)
return 0;[](#l3.928)
continue;[](#l3.929) }[](#l3.930)
break;[](#l3.931)
/* found a potential match */[](#l3.932)
TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));[](#l3.933)
state->start = ptr - (prefix_len - 1);[](#l3.934)
state->ptr = ptr - (prefix_len - prefix_skip - 1);[](#l3.935)
if (flags & SRE_INFO_LITERAL)[](#l3.936)
return 1; /* we got all of it */[](#l3.937)
status = SRE(match)(state, pattern + 2*prefix_skip);[](#l3.938)
if (status != 0)[](#l3.939)
return status;[](#l3.940)
/* close but no cigar -- try again */[](#l3.941)
if (++ptr >= end)[](#l3.942)
return 0;[](#l3.943) }[](#l3.944)
}[](#l3.945)
ptr += state->charsize;[](#l3.946)
i = overlap[i];[](#l3.947)
} @@ -1553,46 +1539,48 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* p if (pattern[0] == SRE_OP_LITERAL) { /* pattern starts with a literal character. this is used for short prefixes, and if fast search is disabled */} while (i != 0);[](#l3.948) }[](#l3.949) return 0;[](#l3.950)
SRE_CODE chr = pattern[1];[](#l3.956)
end = (char*)state->end;[](#l3.957)
for (;;) {[](#l3.958)
while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)[](#l3.959)
ptr += state->charsize;[](#l3.960)
if (ptr >= end)[](#l3.961)
return 0;[](#l3.962)
SRE_CHAR c = (SRE_CHAR) pattern[1];[](#l3.963)
if ((SRE_CODE) c != pattern[1])[](#l3.965)
return 0; /* literal can't match: doesn't fit in char width */[](#l3.966)
end = (SRE_CHAR *)state->end;[](#l3.968)
while (ptr < end) {[](#l3.969)
while (*ptr != c) {[](#l3.970)
if (++ptr >= end)[](#l3.971)
return 0;[](#l3.972)
}[](#l3.973) TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));[](#l3.974) state->start = ptr;[](#l3.975)
ptr += state->charsize;[](#l3.976)
state->ptr = ptr;[](#l3.977)
state->ptr = ++ptr;[](#l3.978) if (flags & SRE_INFO_LITERAL)[](#l3.979) return 1; /* we got all of it */[](#l3.980)
status = SRE_MATCH(state, pattern + 2);[](#l3.981)
} else if (charset) { /* pattern starts with a character from a known set */status = SRE(match)(state, pattern + 2);[](#l3.982) if (status != 0)[](#l3.983) break;[](#l3.984) }[](#l3.985)
end = (char*)state->end;[](#l3.988)
end = (SRE_CHAR *)state->end;[](#l3.989) for (;;) {[](#l3.990)
while (ptr < end && !SRE_CHARSET(charset, SRE_CHARGET(state, ptr, 0)))[](#l3.991)
ptr += state->charsize;[](#l3.992)
while (ptr < end && !SRE(charset)(charset, *ptr))[](#l3.993)
ptr++;[](#l3.994) if (ptr >= end)[](#l3.995) return 0;[](#l3.996) TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));[](#l3.997) state->start = ptr;[](#l3.998) state->ptr = ptr;[](#l3.999)
status = SRE_MATCH(state, pattern);[](#l3.1000)
status = SRE(match)(state, pattern);[](#l3.1001) if (status != 0)[](#l3.1002) break;[](#l3.1003)
ptr += state->charsize;[](#l3.1004)
} else /* general case */ while (ptr <= end) { TRACE(("|%p|%p|SEARCH\n", pattern, ptr));ptr++;[](#l3.1005) }[](#l3.1006)
state->start = state->ptr = ptr;[](#l3.1011)
ptr += state->charsize;[](#l3.1012)
status = SRE_MATCH(state, pattern);[](#l3.1013)
state->start = state->ptr = ptr++;[](#l3.1014)
status = SRE(match)(state, pattern);[](#l3.1015) if (status != 0)[](#l3.1016) break;[](#l3.1017) }[](#l3.1018)
@@ -1600,7 +1588,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* p return status; } -#if !defined(SRE_RECURSIVE) +#endif /* SRE_RECURSIVE / + +#ifndef SRE_RECURSIVE / -------------------------------------------------------------------- / / factories and destructors / @@ -1609,23 +1599,6 @@ SRE_SEARCH(SRE_STATE state, SRE_CODE* p static PyObjectpattern_new_match(PatternObject, SRE_STATE*, int); static PyObjectpattern_scanner(PatternObject, PyObject*, PyObject* kw); -static int -sre_literal_template(int charsize, char* ptr, Py_ssize_t len) -{
- /* check if given string is a literal template (i.e. no escapes) */
- struct {
int charsize;[](#l3.1039)
- } state = {
charsize[](#l3.1041)
- };
- while (len-- > 0) {
if (SRE_CHARGET((&state), ptr, 0) == '\\')[](#l3.1044)
return 0;[](#l3.1045)
ptr += charsize;[](#l3.1046)
- }
- return 1;
-} - static PyObject sre_codesize(PyObject self, PyObject unused) { @@ -1661,72 +1634,41 @@ state_reset(SRE_STATE state) static void* getstring(PyObject* string, Py_ssize_t* p_length,
int* p_logical_charsize, int* p_charsize,[](#l3.1058)
int* p_isbytes, int* p_charsize,[](#l3.1059) Py_buffer *view)[](#l3.1060)
{ /* given a python object, return a data pointer, a length (in characters), and a character size. return NULL if the object is not a string (or not compatible) */
- /* Unicode objects do not support the buffer API. So, get the data directly instead. */ if (PyUnicode_Check(string)) { if (PyUnicode_READY(string) == -1) return NULL;
ptr = PyUnicode_DATA(string);[](#l3.1076) *p_length = PyUnicode_GET_LENGTH(string);[](#l3.1077) *p_charsize = PyUnicode_KIND(string);[](#l3.1078)
*p_logical_charsize = 4;[](#l3.1079)
return ptr;[](#l3.1080)
*p_isbytes = 0;[](#l3.1081)
} /* get pointer to byte string buffer */return PyUnicode_DATA(string);[](#l3.1082)
- view->len = -1;
- buffer = Py_TYPE(string)->tp_as_buffer;
- if (!buffer || !buffer->bf_getbuffer ||
(*buffer->bf_getbuffer)(string, view, PyBUF_SIMPLE) < 0) {[](#l3.1089)
PyErr_SetString(PyExc_TypeError, "expected string or buffer");[](#l3.1090)
return NULL;[](#l3.1091)
- }
- if (bytes < 0) {
PyErr_SetString(PyExc_TypeError, "buffer has negative size");[](#l3.1099)
goto err;[](#l3.1100)
- if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
PyErr_SetString(PyExc_TypeError, "expected string or buffer");[](#l3.1102)
}return NULL;[](#l3.1103)
- if (PyBytes_Check(string) || bytes == size)
charsize = 1;[](#l3.1110)
- else {
PyErr_SetString(PyExc_TypeError, "buffer size mismatch");[](#l3.1112)
goto err;[](#l3.1113)
- if (view->buf == NULL) {
PyErr_SetString(PyExc_ValueError, "Buffer is NULL");[](#l3.1119)
PyBuffer_Release(view);[](#l3.1120)
view->buf = NULL;[](#l3.1121)
} -return NULL;[](#l3.1122)
- if (ptr == NULL) {
PyErr_SetString(PyExc_ValueError,[](#l3.1130)
"Buffer is NULL");[](#l3.1131)
goto err;[](#l3.1132)
- }
- return ptr;
- err:
- PyBuffer_Release(view);
- view->buf = NULL;
- return NULL;
} LOCAL(PyObject*) @@ -1736,7 +1678,7 @@ state_init(SRE_STATE* state, PatternObje /* prepare state object */ Py_ssize_t length;
- int isbytes, charsize; void* ptr; memset(state, 0, sizeof(SRE_STATE)); @@ -1745,16 +1687,16 @@ state_init(SRE_STATE* state, PatternObje state->lastindex = -1; state->buffer.buf = NULL;
- if (isbytes && pattern->isbytes == 0) { PyErr_SetString(PyExc_TypeError, "can't use a string pattern on a bytes-like object"); goto err; }
- if (!isbytes && pattern->isbytes > 0) { PyErr_SetString(PyExc_TypeError, "can't use a bytes pattern on a string-like object"); goto err;
@@ -1771,7 +1713,7 @@ state_init(SRE_STATE* state, PatternObje else if (end > length) end = length;
- state->isbytes = isbytes; state->charsize = charsize; state->beginning = ptr; @@ -1812,10 +1754,10 @@ state_fini(SRE_STATE* state) (((char*)(member) - (char*)(state)->beginning) / (state)->charsize) LOCAL(PyObject*) -getslice(int logical_charsize, const void *ptr, +getslice(int isbytes, const void ptr, PyObject string, Py_ssize_t start, Py_ssize_t end)
- if (isbytes) { if (PyBytes_CheckExact(string) && start == 0 && end == PyBytes_GET_SIZE(string)) { Py_INCREF(string);
@@ -1849,7 +1791,7 @@ state_getslice(SRE_STATE* state, Py_ssiz j = STATE_OFFSET(state, state->mark[index+1]); }
} static void @@ -1882,14 +1824,34 @@ pattern_dealloc(PatternObject* self) { if (self->weakreflist != NULL) PyObject_ClearWeakRefs((PyObject *) self);
- if (self->view.buf)
Py_XDECREF(self->pattern); Py_XDECREF(self->groupindex); Py_XDECREF(self->indexgroup); PyObject_DEL(self); } +LOCAL(Py_ssize_t) +sre_match(SRE_STATE* state, SRE_CODE* pattern) +{PyBuffer_Release(&self->view);[](#l3.1208)
- if (state->charsize == 1)
return sre_ucs1_match(state, pattern);[](#l3.1219)
- if (state->charsize == 2)
return sre_ucs2_match(state, pattern);[](#l3.1221)
- assert(state->charsize == 4);
- return sre_ucs4_match(state, pattern);
+} + +LOCAL(Py_ssize_t) +sre_search(SRE_STATE* state, SRE_CODE* pattern) +{
- if (state->charsize == 1)
return sre_ucs1_search(state, pattern);[](#l3.1230)
- if (state->charsize == 2)
return sre_ucs2_search(state, pattern);[](#l3.1232)
- assert(state->charsize == 4);
- return sre_ucs4_search(state, pattern);
+} + static PyObject* pattern_match(PatternObject* self, PyObject* args, PyObject* kw) { @@ -1912,11 +1874,7 @@ pattern_match(PatternObject* self, PyObj TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
- if (state.logical_charsize == 1) {
status = sre_match(&state, PatternObject_GetCode(self));[](#l3.1245)
- } else {
status = sre_umatch(&state, PatternObject_GetCode(self));[](#l3.1247)
- }
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); if (PyErr_Occurred()) @@ -1947,11 +1905,7 @@ pattern_search(PatternObject* self, PyOb TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
- if (state.logical_charsize == 1) {
status = sre_search(&state, PatternObject_GetCode(self));[](#l3.1258)
- } else {
status = sre_usearch(&state, PatternObject_GetCode(self));[](#l3.1260)
- }
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); @@ -2044,12 +1998,7 @@ pattern_findall(PatternObject* self, PyO state.ptr = state.start;
if (state.logical_charsize == 1) {[](#l3.1270)
status = sre_search(&state, PatternObject_GetCode(self));[](#l3.1271)
} else {[](#l3.1272)
status = sre_usearch(&state, PatternObject_GetCode(self));[](#l3.1273)
}[](#l3.1274)
status = sre_search(&state, PatternObject_GetCode(self));[](#l3.1276) if (PyErr_Occurred())[](#l3.1277) goto error;[](#l3.1278)
@@ -2065,7 +2014,7 @@ pattern_findall(PatternObject* self, PyO case 0: b = STATE_OFFSET(&state, state.start); e = STATE_OFFSET(&state, state.ptr);
item = getslice(state.logical_charsize, state.beginning,[](#l3.1284)
item = getslice(state.isbytes, state.beginning,[](#l3.1285) string, b, e);[](#l3.1286) if (!item)[](#l3.1287) goto error;[](#l3.1288)
@@ -2171,12 +2120,7 @@ pattern_split(PatternObject* self, PyObj state.ptr = state.start;
if (state.logical_charsize == 1) {[](#l3.1293)
status = sre_search(&state, PatternObject_GetCode(self));[](#l3.1294)
} else {[](#l3.1295)
status = sre_usearch(&state, PatternObject_GetCode(self));[](#l3.1296)
}[](#l3.1297)
status = sre_search(&state, PatternObject_GetCode(self));[](#l3.1299) if (PyErr_Occurred())[](#l3.1300) goto error;[](#l3.1301)
@@ -2196,7 +2140,7 @@ pattern_split(PatternObject* self, PyObj } /* get segment before this match */
item = getslice(state.logical_charsize, state.beginning,[](#l3.1307)
item = getslice(state.isbytes, state.beginning,[](#l3.1308) string, STATE_OFFSET(&state, last),[](#l3.1309) STATE_OFFSET(&state, state.start)[](#l3.1310) );[](#l3.1311)
@@ -2225,7 +2169,7 @@ pattern_split(PatternObject* self, PyObj } /* get segment following last match (even if empty) */
- item = getslice(state.isbytes, state.beginning, string, STATE_OFFSET(&state, last), state.endpos ); if (!item) @@ -2260,7 +2204,7 @@ pattern_subx(PatternObject* self, PyObje Py_ssize_t status; Py_ssize_t n; Py_ssize_t i, b, e;
- int isbytes, charsize; int filter_is_callable; Py_buffer view; @@ -2273,10 +2217,13 @@ pattern_subx(PatternObject* self, PyObje /* if not callable, check if it's a literal string */ int literal; view.buf = NULL;
ptr = getstring(ptemplate, &n, &logical_charsize, &charsize, &view);[](#l3.1334)
ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);[](#l3.1335) b = charsize;[](#l3.1336) if (ptr) {[](#l3.1337)
literal = sre_literal_template(charsize, ptr, n);[](#l3.1338)
if (charsize == 1)[](#l3.1339)
literal = memchr(ptr, '\\', n) == NULL;[](#l3.1340)
else[](#l3.1341)
literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;[](#l3.1342) } else {[](#l3.1343) PyErr_Clear();[](#l3.1344) literal = 0;[](#l3.1345)
@@ -2320,12 +2267,7 @@ pattern_subx(PatternObject* self, PyObje state.ptr = state.start;
if (state.logical_charsize == 1) {[](#l3.1350)
status = sre_search(&state, PatternObject_GetCode(self));[](#l3.1351)
} else {[](#l3.1352)
status = sre_usearch(&state, PatternObject_GetCode(self));[](#l3.1353)
}[](#l3.1354)
status = sre_search(&state, PatternObject_GetCode(self));[](#l3.1356) if (PyErr_Occurred())[](#l3.1357) goto error;[](#l3.1358)
@@ -2341,7 +2283,7 @@ pattern_subx(PatternObject* self, PyObje if (i < b) { /* get segment before this match */
item = getslice(state.logical_charsize, state.beginning,[](#l3.1364)
item = getslice(state.isbytes, state.beginning,[](#l3.1365) string, i, b);[](#l3.1366) if (!item)[](#l3.1367) goto error;[](#l3.1368)
@@ -2397,7 +2339,7 @@ next: /* get segment following last match */ if (i < state.endpos) {
item = getslice(state.logical_charsize, state.beginning,[](#l3.1373)
item = getslice(state.isbytes, state.beginning,[](#l3.1374) string, i, state.endpos);[](#l3.1375) if (!item)[](#l3.1376) goto error;[](#l3.1377)
@@ -2412,7 +2354,7 @@ next: Py_DECREF(filter); /* convert list to single string (also removes list) */
- joiner = getslice(state.isbytes, state.beginning, string, 0, 0); if (!joiner) { Py_DECREF(list); return NULL;
@@ -2422,7 +2364,7 @@ next: item = joiner; } else {
if (state.logical_charsize == 1)[](#l3.1391)
if (state.isbytes)[](#l3.1392) item = _PyBytes_Join(joiner, list);[](#l3.1393) else[](#l3.1394) item = PyUnicode_Join(joiner, list);[](#l3.1395)
@@ -2652,7 +2594,6 @@ static PyObject * self->pattern = NULL; self->groupindex = NULL; self->indexgroup = NULL;
self->codesize = n; @@ -2673,16 +2614,20 @@ static PyObject * } if (pattern == Py_None) {
self->logical_charsize = -1;[](#l3.1408)
self->charsize = -1;[](#l3.1409)
if (!getstring(pattern, &p_length, &self->logical_charsize,[](#l3.1414)
&self->charsize, &self->view)) {[](#l3.1415)
int charsize;[](#l3.1416)
Py_buffer view;[](#l3.1417)
view.buf = NULL;[](#l3.1418)
if (!getstring(pattern, &p_length, &self->isbytes,[](#l3.1419)
&charsize, &view)) {[](#l3.1420) Py_DECREF(self);[](#l3.1421) return NULL;[](#l3.1422) }[](#l3.1423)
if (view.buf)[](#l3.1424)
} Py_INCREF(pattern); @@ -2801,7 +2746,7 @@ static int break;PyBuffer_Release(&view);[](#l3.1425)
offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */[](#l3.1433)
offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */[](#l3.1434) if (offset > (Py_uintptr_t)(end - code))[](#l3.1435) FAIL;[](#l3.1436) code += offset;[](#l3.1437)
@@ -2818,7 +2763,7 @@ static int FAIL; } code += offset;
offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */[](#l3.1442)
offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */[](#l3.1443) if (offset > (Py_uintptr_t)(end - code))[](#l3.1444) FAIL;[](#l3.1445) code += offset;[](#l3.1446)
@@ -3188,7 +3133,7 @@ static PyObject* match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def) { Py_ssize_t length;
- int isbytes, charsize; Py_buffer view; PyObject result; void ptr; @@ -3210,12 +3155,12 @@ match_getslice_by_index(MatchObject* sel return def; }
@@ -3790,11 +3735,7 @@ scanner_match(ScannerObject* self, PyObj state->ptr = state->start;
- if (state->logical_charsize == 1) {
status = sre_match(state, PatternObject_GetCode(self->pattern));[](#l3.1477)
- } else {
status = sre_umatch(state, PatternObject_GetCode(self->pattern));[](#l3.1479)
- }
@@ -3821,11 +3762,7 @@ scanner_search(ScannerObject* self, PyOb state->ptr = state->start;
- if (state->logical_charsize == 1) {
status = sre_search(state, PatternObject_GetCode(self->pattern));[](#l3.1490)
- } else {
status = sre_usearch(state, PatternObject_GetCode(self->pattern));[](#l3.1492)
- }
- status = sre_search(state, PatternObject_GetCode(self->pattern)); if (PyErr_Occurred()) return NULL;
@@ -3980,5 +3917,12 @@ PyMODINIT_FUNC PyInit__sre(void) #endif /* !defined(SRE_RECURSIVE) / +#ifdef SRE_RECURSIVE +# undef SRE_RECURSIVE +# undef SRE_CHAR +# undef SIZEOF_SRE_CHAR +# undef SRE +#endif / SRE_RECURSIVE / + / vim:ts=4:sw=4:et */
--- a/Modules/sre.h +++ b/Modules/sre.h @@ -31,9 +31,7 @@ typedef struct { PyObject* pattern; /* pattern source (or None) / int flags; / flags used when compiling pattern source */ PyObject weakreflist; / List of weak references */
- int isbytes; /* pattern type (1 - bytes, 0 - string, -1 - None) / / pattern code / Py_ssize_t codesize; SRE_CODE code[1]; @@ -73,9 +71,8 @@ typedef struct { / attributes for the match object / PyObject string; Py_ssize_t pos, endpos;