cpython: 1adeac2a8714 (original) (raw)

--- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -18,12 +18,15 @@ from _sre import MAXREPEAT SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" -DIGITS = set("0123456789") +DIGITS = frozenset("0123456789") + +OCTDIGITS = frozenset("01234567") +HEXDIGITS = frozenset("0123456789abcdefABCDEF") -OCTDIGITS = set("01234567") -HEXDIGITS = set("0123456789abcdefABCDEF") +WHITESPACE = frozenset(" \t\n\r\v\f") -WHITESPACE = set(" \t\n\r\v\f") +_REPEATCODES = frozenset((MIN_REPEAT, MAX_REPEAT)) +_UNITCODES = frozenset((ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)) ESCAPES = { r"\a": (LITERAL, ord("\a")), @@ -153,11 +156,9 @@ class SubPattern: self.data.append(code) def getwidth(self): # determine the width (min, max) for this subpattern

@@ -176,11 +177,11 @@ class SubPattern: i, j = av[1].getwidth() lo = lo + i hi = hi + j

@@ -191,34 +192,31 @@ class SubPattern: class Tokenizer: def __init__(self, string): self.istext = isinstance(string, str)

@@ -232,6 +230,17 @@ class Tokenizer: result += c self.__next() return result

@@ -279,7 +288,7 @@ def _class_escape(source, escape): escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError

@@ -325,7 +334,7 @@ def _escape(source, escape, state): escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError

@@ -347,11 +356,11 @@ def _escape(source, escape, state): elif c in DIGITS: # octal escape or decimal group reference (sigh) if source.next in DIGITS:

@@ -370,22 +379,18 @@ def _escape(source, escape, state): pass raise error("bogus escape: %s" % repr(escape)) -def _parse_sub(source, state, nested=1): +def _parse_sub(source, state, nested=True): # parse an alternation: a|b|c items = [] itemsappend = items.append sourcematch = source.match

if len(items) == 1: return items[0] @@ -394,7 +399,7 @@ def _parse_sub(source, state, nested=1): subpatternappend = subpattern.append # check if all items share a common prefix

@@ -414,16 +419,12 @@ def _parse_sub(source, state, nested=1): # check if the branch can be replaced by a character set for item in items:

subpattern.append((BRANCH, (None, items))) @@ -433,21 +434,16 @@ def _parse_sub_cond(source, state, condg item_yes = _parse(source, state) if source.match("|"): item_no = _parse(source, state)

+

elif this == "[": # character set @@ -494,39 +493,38 @@ def _parse(source, state): setappend((NEGATE, None)) # check remaining characters start = set[:]

@@ -541,7 +539,7 @@ def _parse(source, state): # XXX: should add charmap optimization here subpatternappend((IN, set))

@@ -552,20 +550,20 @@ def _parse(source, state): min, max = 1, MAXREPEAT elif this == "{": if source.next == "}":

@@ -587,7 +585,7 @@ def _parse(source, state): item = None if not item or (_len(item) == 1 and item[0][0] == AT): raise error("nothing to repeat")

@@ -604,18 +602,14 @@ def _parse(source, state): if sourcematch("?"): group = 0 # options

@@ -623,14 +617,7 @@ def _parse(source, state): raise error("bad character in group name %r" % name) elif sourcematch("="): # named backreference

@@ -647,27 +634,25 @@ def _parse(source, state): if char is None: raise error("unexpected end of pattern") raise error("unknown specifier: ?P%s" % char)

@@ -676,16 +661,9 @@ def _parse(source, state): else: subpatternappend((ASSERT_NOT, (dir, p))) continue

@@ -705,12 +683,14 @@ def _parse(source, state): raise error("bad group number") if condgroup >= MAXGROUPS: raise error("the group number is too large")

@@ -728,7 +708,7 @@ def _parse(source, state): state.closegroup(group) subpatternappend((SUBPATTERN, (group, p))) else:

@@ -742,10 +722,6 @@ def _parse(source, state): elif this == "$": subpattern.append((AT, AT_END))

- else: raise error("parser error") @@ -776,11 +752,11 @@ def parse(str, flags=0, pattern=None): p = _parse_sub(source, pattern, 0) p.pattern.flags = fix_flags(str, p.pattern.flags)

if flags & SRE_FLAG_DEBUG: p.dump() @@ -817,13 +793,7 @@ def parse_template(source, pattern): if c == "g": name = "" if s.match("<"):

--- a/Misc/NEWS +++ b/Misc/NEWS @@ -166,7 +166,9 @@ Core and Builtins Library ------- -- Issue 1519638: Now unmatched groups are replaced with empty strings in re.sub() +- Issue #19380: Optimized parsing of regular expressions. + +- Issue #1519638: Now unmatched groups are replaced with empty strings in re.sub() and re.subn().