cpython: 1adeac2a8714 (original) (raw)

--- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -18,12 +18,15 @@ from _sre import MAXREPEAT SPECIAL_CHARS = ".\[{()+?^$|" REPEAT_CHARS = "+?{" -DIGITS = set("0123456789") +DIGITS = frozenset("0123456789") + +OCTDIGITS = frozenset("01234567") +HEXDIGITS = frozenset("0123456789abcdefABCDEF") -OCTDIGITS = set("01234567") -HEXDIGITS = set("0123456789abcdefABCDEF") +WHITESPACE = frozenset(" \t\n\r\v\f") -WHITESPACE = set(" \t\n\r\v\f") +_REPEATCODES = frozenset((MIN_REPEAT, MAX_REPEAT)) +_UNITCODES = frozenset((ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)) ESCAPES = { r"\a": (LITERAL, ord("\a")), @@ -153,11 +156,9 @@ class SubPattern: self.data.append(code) def getwidth(self): # determine the width (min, max) for this subpattern

```
   if self.width:[](#l1.27)
```

   if self.width is not None:[](#l1.28)
       return self.width[](#l1.29)
   lo = hi = 0[](#l1.30)

   UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)[](#l1.31)

   REPEATCODES = (MIN_REPEAT, MAX_REPEAT)[](#l1.32)
   for op, av in self.data:[](#l1.33)
       if op is BRANCH:[](#l1.34)
           i = MAXREPEAT - 1[](#l1.35)

@@ -176,11 +177,11 @@ class SubPattern: i, j = av[1].getwidth() lo = lo + i hi = hi + j

       elif op in REPEATCODES:[](#l1.40)

       elif op in _REPEATCODES:[](#l1.41)
           i, j = av[2].getwidth()[](#l1.42)
           lo = lo + i * av[0][](#l1.43)
           hi = hi + j * av[1][](#l1.44)

```
       elif op in UNITCODES:[](#l1.45)
```

       elif op in _UNITCODES:[](#l1.46)
           lo = lo + 1[](#l1.47)
           hi = hi + 1[](#l1.48)
       elif op == SUCCESS:[](#l1.49)

@@ -191,34 +192,31 @@ class SubPattern: class Tokenizer: def init(self, string): self.istext = isinstance(string, str)

```
   if not self.istext:[](#l1.54)
```

       string = str(string, 'latin1')[](#l1.55)
   self.string = string[](#l1.56)
   self.index = 0[](#l1.57)
   self.__next()[](#l1.58)

def __next(self):

   if self.index >= len(self.string):[](#l1.60)

```
   index = self.index[](#l1.61)
```
```
   try:[](#l1.62)
```

       char = self.string[index][](#l1.63)

   except IndexError:[](#l1.64)
       self.next = None[](#l1.65)
       return[](#l1.66)

   char = self.string[self.index:self.index+1][](#l1.67)

   # Special case for the str8, since indexing returns a integer[](#l1.68)

   # XXX This is only needed for test_bug_926075 in test_re.py[](#l1.69)

   if char and not self.istext:[](#l1.70)

       char = chr(char[0])[](#l1.71)
   if char == "\\":[](#l1.72)

       index += 1[](#l1.73)
       try:[](#l1.74)

           c = self.string[self.index + 1][](#l1.75)

           char += self.string[index][](#l1.76)
       except IndexError:[](#l1.77)
           raise error("bogus escape (end of line)")[](#l1.78)

```
       if not self.istext:[](#l1.79)
```
```
           c = chr(c)[](#l1.80)
```
```
       char = char + c[](#l1.81)
```

   self.index = self.index + len(char)[](#l1.82)

   self.index = index + 1[](#l1.83)
   self.next = char[](#l1.84)

def match(self, char, skip=1):

def match(self, char): if char == self.next:

```
       if skip:[](#l1.88)
```
```
           self.__next()[](#l1.89)
```
```
       return 1[](#l1.90)
```
```
   return 0[](#l1.91)
```

```
       self.__next()[](#l1.92)
```
```
       return True[](#l1.93)
```
```
   return False[](#l1.94)
```
def get(self): this = self.next self.__next()

@@ -232,6 +230,17 @@ class Tokenizer: result += c self.__next() return result

def getuntil(self, terminator):
```
   result = ''[](#l1.103)
```
```
   while True:[](#l1.104)
```
```
       c = self.next[](#l1.105)
```
```
       self.__next()[](#l1.106)
```
```
       if c is None:[](#l1.107)
```

           raise error("unterminated name")[](#l1.108)

```
       if c == terminator:[](#l1.109)
```
```
           break[](#l1.110)
```
```
       result += c[](#l1.111)
```
```
   return result[](#l1.112)
```
def tell(self): return self.index, self.next def seek(self, index): @@ -270,7 +279,7 @@ def _class_escape(source, escape): if code: return code code = CATEGORIES.get(escape)

if code and code[0] == IN:

if code and code[0] is IN: return code try: c = escape[1:2]

@@ -279,7 +288,7 @@ def _class_escape(source, escape): escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError

       return LITERAL, int(escape[2:], 16) & 0xff[](#l1.129)

       return LITERAL, int(escape[2:], 16)[](#l1.130)
   elif c == "u" and source.istext:[](#l1.131)
       # unicode escape (exactly four digits)[](#l1.132)
       escape += source.getwhile(4, HEXDIGITS)[](#l1.133)

@@ -325,7 +334,7 @@ def _escape(source, escape, state): escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError

       return LITERAL, int(escape[2:], 16) & 0xff[](#l1.138)

       return LITERAL, int(escape[2:], 16)[](#l1.139)
   elif c == "u" and source.istext:[](#l1.140)
       # unicode escape (exactly four digits)[](#l1.141)
       escape += source.getwhile(4, HEXDIGITS)[](#l1.142)

@@ -347,11 +356,11 @@ def _escape(source, escape, state): elif c in DIGITS: # octal escape or decimal group reference (sigh) if source.next in DIGITS:

           escape = escape + source.get()[](#l1.147)

           escape += source.get()[](#l1.148)
           if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and[](#l1.149)
               source.next in OCTDIGITS):[](#l1.150)
               # got three octal digits; this is an octal escape[](#l1.151)

               escape = escape + source.get()[](#l1.152)

               escape += source.get()[](#l1.153)
               c = int(escape[1:], 8)[](#l1.154)
               if c > 0o377:[](#l1.155)
                   raise error('octal escape value %r outside of '[](#l1.156)

@@ -370,22 +379,18 @@ def _escape(source, escape, state): pass raise error("bogus escape: %s" % repr(escape)) -def _parse_sub(source, state, nested=1): +def _parse_sub(source, state, nested=True): # parse an alternation: a|b|c items = [] itemsappend = items.append sourcematch = source.match

while 1:

while True: itemsappend(_parse(source, state))

```
   if sourcematch("|"):[](#l1.171)
```
```
       continue[](#l1.172)
```
```
   if not nested:[](#l1.173)
```

   if not sourcematch("|"):[](#l1.174)
       break[](#l1.175)

   if not source.next or sourcematch(")", 0):[](#l1.176)

```
       break[](#l1.177)
```
```
   else:[](#l1.178)
```

       raise error("pattern not properly closed")[](#l1.179)

if nested and source.next is not None and source.next != ")":

   raise error("pattern not properly closed")[](#l1.181)

if len(items) == 1: return items[0] @@ -394,7 +399,7 @@ def _parse_sub(source, state, nested=1): subpatternappend = subpattern.append # check if all items share a common prefix

while 1:

while True: prefix = None for item in items: if not item:

@@ -414,16 +419,12 @@ def _parse_sub(source, state, nested=1): # check if the branch can be replaced by a character set for item in items:

   if len(item) != 1 or item[0][0] != LITERAL:[](#l1.198)

```
   if len(item) != 1 or item[0][0] is not LITERAL:[](#l1.199)
       break[](#l1.200)
```
else: # we can store this as a character set instead of a # branch (the compiler may optimize this even more)

```
   set = [][](#l1.204)
```
```
   setappend = set.append[](#l1.205)
```
```
   for item in items:[](#l1.206)
```
```
       setappend(item[0])[](#l1.207)
```

   subpatternappend((IN, set))[](#l1.208)

   subpatternappend((IN, [item[0] for item in items]))[](#l1.209)
   return subpattern[](#l1.210)

subpattern.append((BRANCH, (None, items))) @@ -433,21 +434,16 @@ def _parse_sub_cond(source, state, condg item_yes = _parse(source, state) if source.match("|"): item_no = _parse(source, state)

```
   if source.match("|"):[](#l1.217)
```

   if source.next == "|":[](#l1.218)
       raise error("conditional backref with more than two branches")[](#l1.219)

else: item_no = None

if source.next and not source.match(")", 0):

if source.next is not None and source.next != ")": raise error("pattern not properly closed") subpattern = SubPattern(state) subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) return subpattern -_PATTERNENDERS = set("|)") -_ASSERTCHARS = set("=!<") -_LOOKBEHINDASSERTCHARS = set("=!") -_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) - def _parse(source, state):
parse a simple pattern
subpattern = SubPattern(state) @@ -457,32 +453,35 @@ def _parse(source, state): sourceget = source.get sourcematch = source.match _len = len

PATTERNENDERS = _PATTERNENDERS
ASSERTCHARS = _ASSERTCHARS
LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
REPEATCODES = _REPEATCODES

_ord = ord
verbose = state.flags & SRE_FLAG_VERBOSE

while 1:

while True:

   if source.next in PATTERNENDERS:[](#l1.251)

       break # end of subpattern[](#l1.252)

```
   this = sourceget()[](#l1.253)
```

   this = source.next[](#l1.254)
   if this is None:[](#l1.255)
       break # end of pattern[](#l1.256)

```
   if this in "|)":[](#l1.257)
```

       break # end of subpattern[](#l1.258)

```
   sourceget()[](#l1.259)
```

   if state.flags & SRE_FLAG_VERBOSE:[](#l1.261)

   if verbose:[](#l1.262)
       # skip whitespace and comments[](#l1.263)
       if this in WHITESPACE:[](#l1.264)
           continue[](#l1.265)
       if this == "#":[](#l1.266)

```
           while 1:[](#l1.267)
```

           while True:[](#l1.268)
               this = sourceget()[](#l1.269)

               if this in (None, "\n"):[](#l1.270)

               if this is None or this == "\n":[](#l1.271)
                   break[](#l1.272)
           continue[](#l1.273)

   if this and this[0] not in SPECIAL_CHARS:[](#l1.275)

       subpatternappend((LITERAL, ord(this)))[](#l1.276)

```
   if this[0] == "\\":[](#l1.277)
```

       code = _escape(source, this, state)[](#l1.278)

       subpatternappend(code)[](#l1.279)

   elif this not in SPECIAL_CHARS:[](#l1.281)

       subpatternappend((LITERAL, _ord(this)))[](#l1.282)

elif this == "[": # character set @@ -494,39 +493,38 @@ def _parse(source, state): setappend((NEGATE, None)) # check remaining characters start = set[:]

```
       while 1:[](#l1.290)
```

       while True:[](#l1.291)
           this = sourceget()[](#l1.292)

```
           if this is None:[](#l1.293)
```

               raise error("unexpected end of regular expression")[](#l1.294)
           if this == "]" and set != start:[](#l1.295)
               break[](#l1.296)

           elif this and this[0] == "\\":[](#l1.297)

           elif this[0] == "\\":[](#l1.298)
               code1 = _class_escape(source, this)[](#l1.299)

```
           elif this:[](#l1.300)
```

               code1 = LITERAL, ord(this)[](#l1.301)
           else:[](#l1.302)

               raise error("unexpected end of regular expression")[](#l1.303)

               code1 = LITERAL, _ord(this)[](#l1.304)
           if sourcematch("-"):[](#l1.305)
               # potential range[](#l1.306)
               this = sourceget()[](#l1.307)

               if this is None:[](#l1.308)

                   raise error("unexpected end of regular expression")[](#l1.309)
               if this == "]":[](#l1.310)
                   if code1[0] is IN:[](#l1.311)
                       code1 = code1[1][0][](#l1.312)
                   setappend(code1)[](#l1.313)

                   setappend((LITERAL, ord("-")))[](#l1.314)

                   setappend((LITERAL, _ord("-")))[](#l1.315)
                   break[](#l1.316)

```
               elif this:[](#l1.317)
```

                   if this[0] == "\\":[](#l1.318)

                       code2 = _class_escape(source, this)[](#l1.319)

```
                   else:[](#l1.320)
```

                       code2 = LITERAL, ord(this)[](#l1.321)

                   if code1[0] != LITERAL or code2[0] != LITERAL:[](#l1.322)

                       raise error("bad character range")[](#l1.323)

                   lo = code1[1][](#l1.324)

                   hi = code2[1][](#l1.325)

                   if hi < lo:[](#l1.326)

                       raise error("bad character range")[](#l1.327)

                   setappend((RANGE, (lo, hi)))[](#l1.328)

               if this[0] == "\\":[](#l1.329)

                   code2 = _class_escape(source, this)[](#l1.330)
               else:[](#l1.331)

                   raise error("unexpected end of regular expression")[](#l1.332)

                   code2 = LITERAL, _ord(this)[](#l1.333)

               if code1[0] != LITERAL or code2[0] != LITERAL:[](#l1.334)

                   raise error("bad character range")[](#l1.335)

```
               lo = code1[1][](#l1.336)
```
```
               hi = code2[1][](#l1.337)
```
```
               if hi < lo:[](#l1.338)
```

                   raise error("bad character range")[](#l1.339)

               setappend((RANGE, (lo, hi)))[](#l1.340)
           else:[](#l1.341)
               if code1[0] is IN:[](#l1.342)
                   code1 = code1[1][0][](#l1.343)

@@ -541,7 +539,7 @@ def _parse(source, state): # XXX: should add charmap optimization here subpatternappend((IN, set))

   elif this and this[0] in REPEAT_CHARS:[](#l1.348)

   elif this in REPEAT_CHARS:[](#l1.349)
       # repeat previous item[](#l1.350)
       if this == "?":[](#l1.351)
           min, max = 0, 1[](#l1.352)

@@ -552,20 +550,20 @@ def _parse(source, state): min, max = 1, MAXREPEAT elif this == "{": if source.next == "}":

               subpatternappend((LITERAL, ord(this)))[](#l1.357)

               subpatternappend((LITERAL, _ord(this)))[](#l1.358)
               continue[](#l1.359)
           here = source.tell()[](#l1.360)
           min, max = 0, MAXREPEAT[](#l1.361)
           lo = hi = ""[](#l1.362)
           while source.next in DIGITS:[](#l1.363)

               lo = lo + source.get()[](#l1.364)

               lo += sourceget()[](#l1.365)
           if sourcematch(","):[](#l1.366)
               while source.next in DIGITS:[](#l1.367)

                   hi = hi + sourceget()[](#l1.368)

                   hi += sourceget()[](#l1.369)
           else:[](#l1.370)
               hi = lo[](#l1.371)
           if not sourcematch("}"):[](#l1.372)

               subpatternappend((LITERAL, ord(this)))[](#l1.373)

               subpatternappend((LITERAL, _ord(this)))[](#l1.374)
               source.seek(here)[](#l1.375)
               continue[](#l1.376)
           if lo:[](#l1.377)

@@ -587,7 +585,7 @@ def _parse(source, state): item = None if not item or (_len(item) == 1 and item[0][0] == AT): raise error("nothing to repeat")

       if item[0][0] in REPEATCODES:[](#l1.382)

       if item[0][0] in _REPEATCODES:[](#l1.383)
           raise error("multiple repeat")[](#l1.384)
       if sourcematch("?"):[](#l1.385)
           subpattern[-1] = (MIN_REPEAT, (min, max, item))[](#l1.386)

@@ -604,18 +602,14 @@ def _parse(source, state): if sourcematch("?"): group = 0 # options

           if sourcematch("P"):[](#l1.391)

           char = sourceget()[](#l1.392)

```
           if char is None:[](#l1.393)
```

               raise error("unexpected end of pattern")[](#l1.394)

           if char == "P":[](#l1.395)
               # python extensions[](#l1.396)
               if sourcematch("<"):[](#l1.397)
                   # named group: skip forward to end of name[](#l1.398)

```
                   name = ""[](#l1.399)
```
```
                   while 1:[](#l1.400)
```

                       char = sourceget()[](#l1.401)

                       if char is None:[](#l1.402)

                           raise error("unterminated name")[](#l1.403)

                       if char == ">":[](#l1.404)

                           break[](#l1.405)

                       name = name + char[](#l1.406)

                   name = source.getuntil(">")[](#l1.407)
                   group = 1[](#l1.408)
                   if not name:[](#l1.409)
                       raise error("missing group name")[](#l1.410)

@@ -623,14 +617,7 @@ def _parse(source, state): raise error("bad character in group name %r" % name) elif sourcematch("="): # named backreference

```
                   name = ""[](#l1.415)
```
```
                   while 1:[](#l1.416)
```

                       char = sourceget()[](#l1.417)

                       if char is None:[](#l1.418)

                           raise error("unterminated name")[](#l1.419)

                       if char == ")":[](#l1.420)

                           break[](#l1.421)

                       name = name + char[](#l1.422)

                   name = source.getuntil(")")[](#l1.423)
                   if not name:[](#l1.424)
                       raise error("missing group name")[](#l1.425)
                   if not name.isidentifier():[](#l1.426)

@@ -647,27 +634,25 @@ def _parse(source, state): if char is None: raise error("unexpected end of pattern") raise error("unknown specifier: ?P%s" % char)

           elif sourcematch(":"):[](#l1.431)

           elif char == ":":[](#l1.432)
               # non-capturing group[](#l1.433)
               group = 2[](#l1.434)

           elif sourcematch("#"):[](#l1.435)

           elif char == "#":[](#l1.436)
               # comment[](#l1.437)

```
               while 1:[](#l1.438)
```

                   if source.next is None or source.next == ")":[](#l1.439)

```
               while True:[](#l1.440)
```

                   if source.next is None:[](#l1.441)

                       raise error("unbalanced parenthesis")[](#l1.442)

                   if sourceget() == ")":[](#l1.443)
                       break[](#l1.444)

                   sourceget()[](#l1.445)

               if not sourcematch(")"):[](#l1.446)

                   raise error("unbalanced parenthesis")[](#l1.447)
               continue[](#l1.448)

           elif source.next in ASSERTCHARS:[](#l1.449)

           elif char in "=!<":[](#l1.450)
               # lookahead assertions[](#l1.451)

               char = sourceget()[](#l1.452)
               dir = 1[](#l1.453)
               if char == "<":[](#l1.454)

                   if source.next not in LOOKBEHINDASSERTCHARS:[](#l1.455)

                   char = sourceget()[](#l1.456)

                   if char is None or char not in "=!":[](#l1.457)
                       raise error("syntax error")[](#l1.458)
                   dir = -1 # lookbehind[](#l1.459)

                   char = sourceget()[](#l1.460)
               p = _parse_sub(source, state)[](#l1.461)
               if not sourcematch(")"):[](#l1.462)
                   raise error("unbalanced parenthesis")[](#l1.463)

@@ -676,16 +661,9 @@ def _parse(source, state): else: subpatternappend((ASSERT_NOT, (dir, p))) continue

           elif sourcematch("("):[](#l1.468)

           elif char == "(":[](#l1.469)
               # conditional backreference group[](#l1.470)

```
               condname = ""[](#l1.471)
```
```
               while 1:[](#l1.472)
```

                   char = sourceget()[](#l1.473)

                   if char is None:[](#l1.474)

                       raise error("unterminated name")[](#l1.475)

                   if char == ")":[](#l1.476)

```
                       break[](#l1.477)
```

                   condname = condname + char[](#l1.478)

               condname = source.getuntil(")")[](#l1.479)
               group = 2[](#l1.480)
               if not condname:[](#l1.481)
                   raise error("missing group name")[](#l1.482)

@@ -705,12 +683,14 @@ def _parse(source, state): raise error("bad group number") if condgroup >= MAXGROUPS: raise error("the group number is too large")

```
           else:[](#l1.487)
```

           elif char in FLAGS:[](#l1.488)
               # flags[](#l1.489)

               if not source.next in FLAGS:[](#l1.490)

                   raise error("unexpected end of pattern")[](#l1.491)

               state.flags |= FLAGS[char][](#l1.492)
               while source.next in FLAGS:[](#l1.493)

                   state.flags = state.flags | FLAGS[sourceget()][](#l1.494)

                   state.flags |= FLAGS[sourceget()][](#l1.495)

               verbose = state.flags & SRE_FLAG_VERBOSE[](#l1.496)

```
           else:[](#l1.497)
```

               raise error("unexpected end of pattern " + char)[](#l1.498)
       if group:[](#l1.499)
           # parse group contents[](#l1.500)
           if group == 2:[](#l1.501)

@@ -728,7 +708,7 @@ def _parse(source, state): state.closegroup(group) subpatternappend((SUBPATTERN, (group, p))) else:

```
           while 1:[](#l1.506)
```

           while True:[](#l1.507)
               char = sourceget()[](#l1.508)
               if char is None:[](#l1.509)
                   raise error("unexpected end of pattern")[](#l1.510)

@@ -742,10 +722,6 @@ def _parse(source, state): elif this == "$": subpattern.append((AT, AT_END))

   elif this and this[0] == "\\":[](#l1.515)

       code = _escape(source, this, state)[](#l1.516)

       subpatternappend(code)[](#l1.517)

- else: raise error("parser error") @@ -776,11 +752,11 @@ def parse(str, flags=0, pattern=None): p = _parse_sub(source, pattern, 0) p.pattern.flags = fix_flags(str, p.pattern.flags)

tail = source.get()
if tail == ")":

   raise error("unbalanced parenthesis")[](#l1.528)

elif tail:

   raise error("bogus characters at end of regular expression")[](#l1.530)

if source.next is not None:
```
   if source.next == ")":[](#l1.532)
```

       raise error("unbalanced parenthesis")[](#l1.533)

```
   else:[](#l1.534)
```

       raise error("bogus characters at end of regular expression")[](#l1.535)

if flags & SRE_FLAG_DEBUG: p.dump() @@ -817,13 +793,7 @@ def parse_template(source, pattern): if c == "g": name = "" if s.match("<"):

```
               while True:[](#l1.543)
```

                   char = sget()[](#l1.544)

                   if char is None:[](#l1.545)

                       raise error("unterminated group name")[](#l1.546)

                   if char == ">":[](#l1.547)

```
                       break[](#l1.548)
```

                   name += char[](#l1.549)

               name = s.getuntil(">")[](#l1.550)
           if not name:[](#l1.551)
               raise error("missing group name")[](#l1.552)
           try:[](#l1.553)

--- a/Misc/NEWS +++ b/Misc/NEWS @@ -166,7 +166,9 @@ Core and Builtins Library ------- -- Issue 1519638: Now unmatched groups are replaced with empty strings in re.sub() +- Issue #19380: Optimized parsing of regular expressions. + +- Issue #1519638: Now unmatched groups are replaced with empty strings in re.sub() and re.subn().

Issue #18615: sndhdr.what/whathdr now return a namedtuple.

cpython: 1adeac2a8714 (original) (raw)

parse a simple pattern