cpython: 1adeac2a8714 (original) (raw)
--- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -18,12 +18,15 @@ from _sre import MAXREPEAT SPECIAL_CHARS = ".\[{()+?^$|" REPEAT_CHARS = "+?{" -DIGITS = set("0123456789") +DIGITS = frozenset("0123456789") + +OCTDIGITS = frozenset("01234567") +HEXDIGITS = frozenset("0123456789abcdefABCDEF") -OCTDIGITS = set("01234567") -HEXDIGITS = set("0123456789abcdefABCDEF") +WHITESPACE = frozenset(" \t\n\r\v\f") -WHITESPACE = set(" \t\n\r\v\f") +_REPEATCODES = frozenset((MIN_REPEAT, MAX_REPEAT)) +_UNITCODES = frozenset((ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)) ESCAPES = { r"\a": (LITERAL, ord("\a")), @@ -153,11 +156,9 @@ class SubPattern: self.data.append(code) def getwidth(self): # determine the width (min, max) for this subpattern
if self.width:[](#l1.27)
if self.width is not None:[](#l1.28) return self.width[](#l1.29) lo = hi = 0[](#l1.30)
UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)[](#l1.31)
REPEATCODES = (MIN_REPEAT, MAX_REPEAT)[](#l1.32) for op, av in self.data:[](#l1.33) if op is BRANCH:[](#l1.34) i = MAXREPEAT - 1[](#l1.35)
@@ -176,11 +177,11 @@ class SubPattern: i, j = av[1].getwidth() lo = lo + i hi = hi + j
elif op in REPEATCODES:[](#l1.40)
elif op in _REPEATCODES:[](#l1.41) i, j = av[2].getwidth()[](#l1.42) lo = lo + i * av[0][](#l1.43) hi = hi + j * av[1][](#l1.44)
elif op in UNITCODES:[](#l1.45)
elif op in _UNITCODES:[](#l1.46) lo = lo + 1[](#l1.47) hi = hi + 1[](#l1.48) elif op == SUCCESS:[](#l1.49)
@@ -191,34 +192,31 @@ class SubPattern: class Tokenizer: def init(self, string): self.istext = isinstance(string, str)
if not self.istext:[](#l1.54)
def __next(self):string = str(string, 'latin1')[](#l1.55) self.string = string[](#l1.56) self.index = 0[](#l1.57) self.__next()[](#l1.58)
if self.index >= len(self.string):[](#l1.60)
index = self.index[](#l1.61)
try:[](#l1.62)
char = self.string[index][](#l1.63)
except IndexError:[](#l1.64) self.next = None[](#l1.65) return[](#l1.66)
char = self.string[self.index:self.index+1][](#l1.67)
# Special case for the str8, since indexing returns a integer[](#l1.68)
# XXX This is only needed for test_bug_926075 in test_re.py[](#l1.69)
if char and not self.istext:[](#l1.70)
char = chr(char[0])[](#l1.71) if char == "\\":[](#l1.72)
index += 1[](#l1.73) try:[](#l1.74)
c = self.string[self.index + 1][](#l1.75)
char += self.string[index][](#l1.76) except IndexError:[](#l1.77) raise error("bogus escape (end of line)")[](#l1.78)
if not self.istext:[](#l1.79)
c = chr(c)[](#l1.80)
char = char + c[](#l1.81)
self.index = self.index + len(char)[](#l1.82)
self.index = index + 1[](#l1.83) self.next = char[](#l1.84)
if skip:[](#l1.88)
self.__next()[](#l1.89)
return 1[](#l1.90)
return 0[](#l1.91)
self.__next()[](#l1.92)
return True[](#l1.93)
def get(self): this = self.next self.__next()return False[](#l1.94)
@@ -232,6 +230,17 @@ class Tokenizer: result += c self.__next() return result
- def getuntil(self, terminator):
result = ''[](#l1.103)
while True:[](#l1.104)
c = self.next[](#l1.105)
self.__next()[](#l1.106)
if c is None:[](#l1.107)
raise error("unterminated name")[](#l1.108)
if c == terminator:[](#l1.109)
break[](#l1.110)
result += c[](#l1.111)
def tell(self): return self.index, self.next def seek(self, index): @@ -270,7 +279,7 @@ def _class_escape(source, escape): if code: return code code = CATEGORIES.get(escape)return result[](#l1.112)
@@ -279,7 +288,7 @@ def _class_escape(source, escape): escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError
return LITERAL, int(escape[2:], 16) & 0xff[](#l1.129)
return LITERAL, int(escape[2:], 16)[](#l1.130) elif c == "u" and source.istext:[](#l1.131) # unicode escape (exactly four digits)[](#l1.132) escape += source.getwhile(4, HEXDIGITS)[](#l1.133)
@@ -325,7 +334,7 @@ def _escape(source, escape, state): escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError
return LITERAL, int(escape[2:], 16) & 0xff[](#l1.138)
return LITERAL, int(escape[2:], 16)[](#l1.139) elif c == "u" and source.istext:[](#l1.140) # unicode escape (exactly four digits)[](#l1.141) escape += source.getwhile(4, HEXDIGITS)[](#l1.142)
@@ -347,11 +356,11 @@ def _escape(source, escape, state): elif c in DIGITS: # octal escape or decimal group reference (sigh) if source.next in DIGITS:
escape = escape + source.get()[](#l1.147)
escape += source.get()[](#l1.148) if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and[](#l1.149) source.next in OCTDIGITS):[](#l1.150) # got three octal digits; this is an octal escape[](#l1.151)
escape = escape + source.get()[](#l1.152)
escape += source.get()[](#l1.153) c = int(escape[1:], 8)[](#l1.154) if c > 0o377:[](#l1.155) raise error('octal escape value %r outside of '[](#l1.156)
@@ -370,22 +379,18 @@ def _escape(source, escape, state): pass raise error("bogus escape: %s" % repr(escape)) -def _parse_sub(source, state, nested=1): +def _parse_sub(source, state, nested=True): # parse an alternation: a|b|c items = [] itemsappend = items.append sourcematch = source.match
if sourcematch("|"):[](#l1.171)
continue[](#l1.172)
if not nested:[](#l1.173)
if not sourcematch("|"):[](#l1.174) break[](#l1.175)
if not source.next or sourcematch(")", 0):[](#l1.176)
break[](#l1.177)
else:[](#l1.178)
raise error("pattern not properly closed")[](#l1.179)
- if nested and source.next is not None and source.next != ")":
raise error("pattern not properly closed")[](#l1.181)
if len(items) == 1: return items[0] @@ -394,7 +399,7 @@ def _parse_sub(source, state, nested=1): subpatternappend = subpattern.append # check if all items share a common prefix
@@ -414,16 +419,12 @@ def _parse_sub(source, state, nested=1): # check if the branch can be replaced by a character set for item in items:
if len(item) != 1 or item[0][0] != LITERAL:[](#l1.198)
else: # we can store this as a character set instead of a # branch (the compiler may optimize this even more)if len(item) != 1 or item[0][0] is not LITERAL:[](#l1.199) break[](#l1.200)
set = [][](#l1.204)
setappend = set.append[](#l1.205)
for item in items:[](#l1.206)
setappend(item[0])[](#l1.207)
subpatternappend((IN, set))[](#l1.208)
subpatternappend((IN, [item[0] for item in items]))[](#l1.209) return subpattern[](#l1.210)
subpattern.append((BRANCH, (None, items))) @@ -433,21 +434,16 @@ def _parse_sub_cond(source, state, condg item_yes = _parse(source, state) if source.match("|"): item_no = _parse(source, state)
if source.match("|"):[](#l1.217)
else: item_no = Noneif source.next == "|":[](#l1.218) raise error("conditional backref with more than two branches")[](#l1.219)
- if source.next is not None and source.next != ")":
raise error("pattern not properly closed")
subpattern = SubPattern(state)
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern
-_PATTERNENDERS = set("|)")
-_ASSERTCHARS = set("=!<")
-_LOOKBEHINDASSERTCHARS = set("=!")
-_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
-
def _parse(source, state):
subpattern = SubPattern(state) parse a simple pattern @@ -457,32 +453,35 @@ def _parse(source, state): sourceget = source.get sourcematch = source.match _len = len
- PATTERNENDERS = _PATTERNENDERS
- ASSERTCHARS = _ASSERTCHARS
- LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
- REPEATCODES = _REPEATCODES
if source.next in PATTERNENDERS:[](#l1.251)
break # end of subpattern[](#l1.252)
this = sourceget()[](#l1.253)
this = source.next[](#l1.254) if this is None:[](#l1.255) break # end of pattern[](#l1.256)
if this in "|)":[](#l1.257)
break # end of subpattern[](#l1.258)
sourceget()[](#l1.259)
if state.flags & SRE_FLAG_VERBOSE:[](#l1.261)
if verbose:[](#l1.262) # skip whitespace and comments[](#l1.263) if this in WHITESPACE:[](#l1.264) continue[](#l1.265) if this == "#":[](#l1.266)
while 1:[](#l1.267)
while True:[](#l1.268) this = sourceget()[](#l1.269)
if this in (None, "\n"):[](#l1.270)
if this is None or this == "\n":[](#l1.271) break[](#l1.272) continue[](#l1.273)
if this and this[0] not in SPECIAL_CHARS:[](#l1.275)
subpatternappend((LITERAL, ord(this)))[](#l1.276)
if this[0] == "\\":[](#l1.277)
code = _escape(source, this, state)[](#l1.278)
subpatternappend(code)[](#l1.279)
elif this not in SPECIAL_CHARS:[](#l1.281)
subpatternappend((LITERAL, _ord(this)))[](#l1.282)
elif this == "[": # character set @@ -494,39 +493,38 @@ def _parse(source, state): setappend((NEGATE, None)) # check remaining characters start = set[:]
while 1:[](#l1.290)
while True:[](#l1.291) this = sourceget()[](#l1.292)
if this is None:[](#l1.293)
raise error("unexpected end of regular expression")[](#l1.294) if this == "]" and set != start:[](#l1.295) break[](#l1.296)
elif this and this[0] == "\\":[](#l1.297)
elif this[0] == "\\":[](#l1.298) code1 = _class_escape(source, this)[](#l1.299)
elif this:[](#l1.300)
code1 = LITERAL, ord(this)[](#l1.301) else:[](#l1.302)
raise error("unexpected end of regular expression")[](#l1.303)
code1 = LITERAL, _ord(this)[](#l1.304) if sourcematch("-"):[](#l1.305) # potential range[](#l1.306) this = sourceget()[](#l1.307)
if this is None:[](#l1.308)
raise error("unexpected end of regular expression")[](#l1.309) if this == "]":[](#l1.310) if code1[0] is IN:[](#l1.311) code1 = code1[1][0][](#l1.312) setappend(code1)[](#l1.313)
setappend((LITERAL, ord("-")))[](#l1.314)
setappend((LITERAL, _ord("-")))[](#l1.315) break[](#l1.316)
elif this:[](#l1.317)
if this[0] == "\\":[](#l1.318)
code2 = _class_escape(source, this)[](#l1.319)
else:[](#l1.320)
code2 = LITERAL, ord(this)[](#l1.321)
if code1[0] != LITERAL or code2[0] != LITERAL:[](#l1.322)
raise error("bad character range")[](#l1.323)
lo = code1[1][](#l1.324)
hi = code2[1][](#l1.325)
if hi < lo:[](#l1.326)
raise error("bad character range")[](#l1.327)
setappend((RANGE, (lo, hi)))[](#l1.328)
if this[0] == "\\":[](#l1.329)
code2 = _class_escape(source, this)[](#l1.330) else:[](#l1.331)
raise error("unexpected end of regular expression")[](#l1.332)
code2 = LITERAL, _ord(this)[](#l1.333)
if code1[0] != LITERAL or code2[0] != LITERAL:[](#l1.334)
raise error("bad character range")[](#l1.335)
lo = code1[1][](#l1.336)
hi = code2[1][](#l1.337)
if hi < lo:[](#l1.338)
raise error("bad character range")[](#l1.339)
setappend((RANGE, (lo, hi)))[](#l1.340) else:[](#l1.341) if code1[0] is IN:[](#l1.342) code1 = code1[1][0][](#l1.343)
@@ -541,7 +539,7 @@ def _parse(source, state): # XXX: should add charmap optimization here subpatternappend((IN, set))
elif this and this[0] in REPEAT_CHARS:[](#l1.348)
elif this in REPEAT_CHARS:[](#l1.349) # repeat previous item[](#l1.350) if this == "?":[](#l1.351) min, max = 0, 1[](#l1.352)
@@ -552,20 +550,20 @@ def _parse(source, state): min, max = 1, MAXREPEAT elif this == "{": if source.next == "}":
subpatternappend((LITERAL, ord(this)))[](#l1.357)
subpatternappend((LITERAL, _ord(this)))[](#l1.358) continue[](#l1.359) here = source.tell()[](#l1.360) min, max = 0, MAXREPEAT[](#l1.361) lo = hi = ""[](#l1.362) while source.next in DIGITS:[](#l1.363)
lo = lo + source.get()[](#l1.364)
lo += sourceget()[](#l1.365) if sourcematch(","):[](#l1.366) while source.next in DIGITS:[](#l1.367)
hi = hi + sourceget()[](#l1.368)
hi += sourceget()[](#l1.369) else:[](#l1.370) hi = lo[](#l1.371) if not sourcematch("}"):[](#l1.372)
subpatternappend((LITERAL, ord(this)))[](#l1.373)
subpatternappend((LITERAL, _ord(this)))[](#l1.374) source.seek(here)[](#l1.375) continue[](#l1.376) if lo:[](#l1.377)
@@ -587,7 +585,7 @@ def _parse(source, state): item = None if not item or (_len(item) == 1 and item[0][0] == AT): raise error("nothing to repeat")
if item[0][0] in REPEATCODES:[](#l1.382)
if item[0][0] in _REPEATCODES:[](#l1.383) raise error("multiple repeat")[](#l1.384) if sourcematch("?"):[](#l1.385) subpattern[-1] = (MIN_REPEAT, (min, max, item))[](#l1.386)
@@ -604,18 +602,14 @@ def _parse(source, state): if sourcematch("?"): group = 0 # options
if sourcematch("P"):[](#l1.391)
char = sourceget()[](#l1.392)
if char is None:[](#l1.393)
raise error("unexpected end of pattern")[](#l1.394)
if char == "P":[](#l1.395) # python extensions[](#l1.396) if sourcematch("<"):[](#l1.397) # named group: skip forward to end of name[](#l1.398)
name = ""[](#l1.399)
while 1:[](#l1.400)
char = sourceget()[](#l1.401)
if char is None:[](#l1.402)
raise error("unterminated name")[](#l1.403)
if char == ">":[](#l1.404)
break[](#l1.405)
name = name + char[](#l1.406)
name = source.getuntil(">")[](#l1.407) group = 1[](#l1.408) if not name:[](#l1.409) raise error("missing group name")[](#l1.410)
@@ -623,14 +617,7 @@ def _parse(source, state): raise error("bad character in group name %r" % name) elif sourcematch("="): # named backreference
name = ""[](#l1.415)
while 1:[](#l1.416)
char = sourceget()[](#l1.417)
if char is None:[](#l1.418)
raise error("unterminated name")[](#l1.419)
if char == ")":[](#l1.420)
break[](#l1.421)
name = name + char[](#l1.422)
name = source.getuntil(")")[](#l1.423) if not name:[](#l1.424) raise error("missing group name")[](#l1.425) if not name.isidentifier():[](#l1.426)
@@ -647,27 +634,25 @@ def _parse(source, state): if char is None: raise error("unexpected end of pattern") raise error("unknown specifier: ?P%s" % char)
elif sourcematch(":"):[](#l1.431)
elif char == ":":[](#l1.432) # non-capturing group[](#l1.433) group = 2[](#l1.434)
elif sourcematch("#"):[](#l1.435)
elif char == "#":[](#l1.436) # comment[](#l1.437)
while 1:[](#l1.438)
if source.next is None or source.next == ")":[](#l1.439)
while True:[](#l1.440)
if source.next is None:[](#l1.441)
raise error("unbalanced parenthesis")[](#l1.442)
if sourceget() == ")":[](#l1.443) break[](#l1.444)
sourceget()[](#l1.445)
if not sourcematch(")"):[](#l1.446)
raise error("unbalanced parenthesis")[](#l1.447) continue[](#l1.448)
elif source.next in ASSERTCHARS:[](#l1.449)
elif char in "=!<":[](#l1.450) # lookahead assertions[](#l1.451)
char = sourceget()[](#l1.452) dir = 1[](#l1.453) if char == "<":[](#l1.454)
if source.next not in LOOKBEHINDASSERTCHARS:[](#l1.455)
char = sourceget()[](#l1.456)
if char is None or char not in "=!":[](#l1.457) raise error("syntax error")[](#l1.458) dir = -1 # lookbehind[](#l1.459)
char = sourceget()[](#l1.460) p = _parse_sub(source, state)[](#l1.461) if not sourcematch(")"):[](#l1.462) raise error("unbalanced parenthesis")[](#l1.463)
@@ -676,16 +661,9 @@ def _parse(source, state): else: subpatternappend((ASSERT_NOT, (dir, p))) continue
elif sourcematch("("):[](#l1.468)
elif char == "(":[](#l1.469) # conditional backreference group[](#l1.470)
condname = ""[](#l1.471)
while 1:[](#l1.472)
char = sourceget()[](#l1.473)
if char is None:[](#l1.474)
raise error("unterminated name")[](#l1.475)
if char == ")":[](#l1.476)
break[](#l1.477)
condname = condname + char[](#l1.478)
condname = source.getuntil(")")[](#l1.479) group = 2[](#l1.480) if not condname:[](#l1.481) raise error("missing group name")[](#l1.482)
@@ -705,12 +683,14 @@ def _parse(source, state): raise error("bad group number") if condgroup >= MAXGROUPS: raise error("the group number is too large")
else:[](#l1.487)
elif char in FLAGS:[](#l1.488) # flags[](#l1.489)
if not source.next in FLAGS:[](#l1.490)
raise error("unexpected end of pattern")[](#l1.491)
state.flags |= FLAGS[char][](#l1.492) while source.next in FLAGS:[](#l1.493)
state.flags = state.flags | FLAGS[sourceget()][](#l1.494)
state.flags |= FLAGS[sourceget()][](#l1.495)
verbose = state.flags & SRE_FLAG_VERBOSE[](#l1.496)
else:[](#l1.497)
raise error("unexpected end of pattern " + char)[](#l1.498) if group:[](#l1.499) # parse group contents[](#l1.500) if group == 2:[](#l1.501)
@@ -728,7 +708,7 @@ def _parse(source, state): state.closegroup(group) subpatternappend((SUBPATTERN, (group, p))) else:
while 1:[](#l1.506)
while True:[](#l1.507) char = sourceget()[](#l1.508) if char is None:[](#l1.509) raise error("unexpected end of pattern")[](#l1.510)
@@ -742,10 +722,6 @@ def _parse(source, state): elif this == "$": subpattern.append((AT, AT_END))
elif this and this[0] == "\\":[](#l1.515)
code = _escape(source, this, state)[](#l1.516)
subpatternappend(code)[](#l1.517)
- else: raise error("parser error") @@ -776,11 +752,11 @@ def parse(str, flags=0, pattern=None): p = _parse_sub(source, pattern, 0) p.pattern.flags = fix_flags(str, p.pattern.flags)
- tail = source.get()
- if tail == ")":
raise error("unbalanced parenthesis")[](#l1.528)
- elif tail:
raise error("bogus characters at end of regular expression")[](#l1.530)
- if source.next is not None:
if source.next == ")":[](#l1.532)
raise error("unbalanced parenthesis")[](#l1.533)
else:[](#l1.534)
raise error("bogus characters at end of regular expression")[](#l1.535)
if flags & SRE_FLAG_DEBUG: p.dump() @@ -817,13 +793,7 @@ def parse_template(source, pattern): if c == "g": name = "" if s.match("<"):
while True:[](#l1.543)
char = sget()[](#l1.544)
if char is None:[](#l1.545)
raise error("unterminated group name")[](#l1.546)
if char == ">":[](#l1.547)
break[](#l1.548)
name += char[](#l1.549)
name = s.getuntil(">")[](#l1.550) if not name:[](#l1.551) raise error("missing group name")[](#l1.552) try:[](#l1.553)
--- a/Misc/NEWS +++ b/Misc/NEWS @@ -166,7 +166,9 @@ Core and Builtins Library ------- -- Issue 1519638: Now unmatched groups are replaced with empty strings in re.sub() +- Issue #19380: Optimized parsing of regular expressions. + +- Issue #1519638: Now unmatched groups are replaced with empty strings in re.sub() and re.subn().