# (original) (raw)

# rxb, a simple regular expression builder (by Ka-Ping Yee, 20 Sept 1996)
#
# 1996-10-22: changed to backslash-parens everywhere as a workaround for the
#             regex.symcomp() bug pointed out by William S. Lear
#             <rael@dejanews.com>
#
# 1996-11-08: bug reported by Jonathan Giddy <jon@dstc.edu.au>
#             literal parentheses no longer escaped
#
# 2000-01-26: converted for re module; added sub, split, followedby

"""rxb, a simple regular expression builder (by Ka-Ping Yee, 20 Sept 1996)

From an idea by Greg Ewing on comp.lang.python.

This module encapsulates the construction and functionality of regular
expressions in a class named 'Pattern'.  To build 'Pattern's, use the
functions and constants in this module; you should not need to instance
the 'Pattern' class directly unless you are actually supplying a real
regular expression.

You can concatenate 'Pattern' instances using the '+' operator or repeat
them using the '*' operator with a number.

The available functions are:

    exactly(<string>)                 :: exactly the given string
    anybut(<string>)                  :: text not containing the string
    member(<chars>, <chars>, ...)     :: any single char mentioned
    nonmember(<chars>, <chars>, ...)  :: any single char not mentioned
    maybe(<pattern>)                  :: zero or one occurrence
    some(<pattern>)                   :: one or more occurrences
    any(<pattern>)                    :: zero or more occurrences
    either(<pattern>, <pattern>, ...) :: one of the alternatives
    label(<name>, <pattern>)          :: label a subgroup for later
    followedby(<pattern>)             :: positive lookahead assertion
    notfollowedby(<pattern>)          :: negative lookahead assertion

For 'label' you can also use the alternate, more concise syntax

    label.<name>(<pattern>)

The 'followedby' and 'notfollowedby' functions indicate that you want to
look for a match after a particular point, or make sure that there is
*not* a match after a particular point, without actually consuming any
of the string being matched.

The first four functions only accept literal strings.  The rest all
accept either literals or 'Pattern's otherwise created by this module.
Note that 'exactly()' is necessary only if used alone, since any string
will be converted from a literal to a 'Pattern' by any of the other
operations (including '+').

'member()' and 'nonmember()' accept any literal characters or strings of
characters among their arguments, as well as the special constants
'letters', 'digits', 'hexdigits', 'wordchars', and 'whitespace' from this
module.  (The corresponding constants starting with 'non-' do not work
here.)  You can also give to 'member()' or 'nonmember()' a sequence
created using 'chrange(<start>, <end>)'.

For your convenience, the following 'Pattern' constants are also
available:

    letter, letters       :: any small or capital letter
    digit, digits         :: any digit
    wordchar, wordchars   :: letter, digit, or underscore
    hexdigit, hexdigits   :: any hexadecimal digit
    whitespace            :: space, return, newline, tab
    anychar, anychars     :: any single character
    nonletter, nondigit, nonwordchar, nonhexdigit, or nonwhitespace
                          :: any char other than the indicated type
    begline, endline      :: beginning or end of line
    anything              :: any number of non-newlines
    something             :: one or more non-newlines
    anyspace              :: any amount of whitespace
    somespace             :: one or more whitespace chars

When you're done constructing, use these 'Pattern' methods to do real
work:

    match(<string>[, <start>])     :: match at beginning of string or at index
    search(<string>[, <start>])    :: find anywhere in string or after index
    sub(repl, string[, <count>])   :: substitute (at most 'count' times)
    subn(repl, string[, <count>])  :: substitute and also return count of hits
    split(string[, <pieces>])      :: split (into at most given # of pieces)
    imatch(<string>[, <start>])    :: case-insensitive match
    isearch(<string>[, <start>])   :: case-insensitive search

Each 'Pattern' will manage its own compilation.  If for some reason you
must get the compiled regular expression (compiled using Python's
built-in 're' module) you can use the 'compile()' and 'icompile()'
methods.

The following 'group' method and attributes work both on the 'Match'
object returned by one of the above four methods, or on the 'Pattern'
object itself where they refer to the last match or search attempt.

    found             :: the entire string that matched
    before            :: everything before what matched
    after             :: everything after what matched
    group(<label>)    :: the string that matched a group
    start([<group>])  :: the index where a group started
    end([<group>])    :: the index where a group ended
    span([<group>])   :: a group's starting and ending indices
    string            :: the whole string we tried to match

If no argument is given, 'group()', 'start()', 'end()', and 'span()'
return information about the entire string that matched.  Instead of
'pat.group(<label>)' you can use the more concise syntax

    pat.<label>

as long as you don't use labels named 'match', 'search', etc.

A note about importing: Since you may be using lots of functions from
this module together at once, you may be tempted to do 'from rxb import
*'.  If you want to accomplish this, but don't want to damage your
current namespace, try the 'welcome()' and 'banish()' functions instead.
The 'welcome()' function takes one optional argument, the module to
"move in" to; if omitted, it defaults to the module from which welcome()
is called.  Here's what happens:

    >>> letter = 'hello'
    >>> def digit(x, y): return x + y
    >>> import rxb
    >>> letter
    'hello'
    >>> rxb.welcome()       # selected names are saved and rebound
    >>> letter
    <Pattern: [A-Za-z]>
    >>> digit               # now you don't have to write "rxb.digit"
    <Pattern: [0-9]>
    # ... do your regex work here ...
    >>> rxb.banish()        # our names are restored
    >>> letter
    'hello'
    >>> digit(8, 9)
    17
"""

import re
import string
import sys
import __main__


class error(Exception):
    """Raised when this module is misused.

    Raised by 'member()' / 'nonmember()' for non-string arguments and by
    'welcome()' / 'banish()' when the residency state is wrong.  (This
    used to be a string exception, which no longer works in Python.)
    """


# ------------------------------------------------------------- Pattern class

def _match_attribute(found, label):
    """Look up a convenience attribute on an 're' match object.

    'found' is the match object (or None when nothing has matched yet)
    and 'label' is the attribute name requested.  Anything that is not
    one of the fixed names is treated as a group label.  Raises
    AttributeError, never IndexError, so 'hasattr()' and friends behave.
    """
    if found is None:
        raise AttributeError("no match available to provide %r" % (label,))
    if label == "found":
        return found.group(0)
    if label == "before":
        return found.string[:found.start()]
    if label == "after":
        return found.string[found.end():]
    if label == "start":
        return found.start
    if label == "end":
        return found.end
    if label == "span":
        return found.span
    if label == "string":
        return found.string
    try:
        return found.group(label)
    except IndexError:
        # 're' reports an unknown group name with IndexError.
        raise AttributeError(label)


def _split(prog, string, pieces):
    """Split 'string' with compiled regex 'prog' into at most 'pieces' parts.

    'pieces' of 0 means no limit.  Always returns a list.
    """
    if pieces == 0:
        return prog.split(string)
    if pieces == 1:
        # re's maxsplit=0 means "no limit", so one piece is a special case.
        return [string]
    return prog.split(string, pieces - 1)


class Pattern(object):
    """Class encapsulating regular expression functionality.

    This class just stores one regular-expression string, and allows you
    to use the addition operator (with other Patterns or ordinary strings)
    and the multiplication operator (with integers).  It produces and
    caches its own compiled-regex object so you can use searching methods
    on a 'Pattern' object.  (Actually, there may be two regex objects, one
    case-sensitive and one case-insensitive.)
    """

    def __init__(self, regex):
        # can init with a regular expression
        self.regex = regex
        self.prog = None        # cached case-sensitive compiled regex
        self.iprog = None       # cached case-insensitive compiled regex
        self.lastmatch = None   # result of the latest match/search attempt

    def __add__(self, other):
        return Pattern(self.regex + makepat(other).regex)

    def __radd__(self, other):
        return Pattern(makepat(other).regex + self.regex)

    def __mul__(self, number):
        return Pattern(self.regex * number)

    def __rmul__(self, number):
        return Pattern(self.regex * number)

    def __repr__(self):
        return "<Pattern: %s>" % (self.regex,)

    def compile(self):
        """Return the case-sensitive compiled regex, compiling it once."""
        if self.prog is None:
            self.prog = re.compile(self.regex)
        return self.prog

    def icompile(self):
        """Return the case-insensitive compiled regex, compiling it once."""
        if self.iprog is None:
            self.iprog = re.compile(self.regex, re.IGNORECASE)
        return self.iprog

    def _record(self, found):
        """Remember 'found' as the last attempt; wrap it in a Match or
        return None when there was no match."""
        self.lastmatch = found
        if found is None:
            return None
        return Match(found)

    def search(self, string, start=0):
        return self._record(self.compile().search(string, start))

    def isearch(self, string, start=0):
        return self._record(self.icompile().search(string, start))

    def match(self, string, start=0):
        return self._record(self.compile().match(string, start))

    def imatch(self, string, start=0):
        return self._record(self.icompile().match(string, start))

    def split(self, string, pieces=0):
        """Split 'string' into at most 'pieces' parts (0 = no limit)."""
        return _split(self.compile(), string, pieces)

    def isplit(self, string, pieces=0):
        """Case-insensitive version of 'split()'."""
        return _split(self.icompile(), string, pieces)

    def sub(self, repl, string, count=0):
        return self.compile().sub(repl, string, count)

    def isub(self, repl, string, count=0):
        return self.icompile().sub(repl, string, count)

    def subn(self, repl, string, count=0):
        return self.compile().subn(repl, string, count)

    def isubn(self, repl, string, count=0):
        return self.icompile().subn(repl, string, count)

    def __getattr__(self, label):
        # Only reached when normal lookup fails.  Refuse special names and
        # our own storage slot so copy/pickle probes and half-built
        # instances cannot recurse or be mistaken for group labels.
        if label.startswith("__") or label == "lastmatch":
            raise AttributeError(label)
        return _match_attribute(self.lastmatch, label)

    def group(self, label=0):
        return self.lastmatch.group(label)


class Match(object):
    """A more pleasant interface to re.MatchObject."""

    def __init__(self, match):
        self.match = match

    def __repr__(self):
        return "<Match: %r>" % (self.match.group(0),)

    def __len__(self):
        return self.match.end() - self.match.start()

    def __bool__(self):
        # A successful match is always true, even when it is zero-width;
        # without this, __len__ would make an empty match look like failure.
        return True

    __nonzero__ = __bool__      # Python 2 spelling

    def __getattr__(self, label):
        if label.startswith("__") or label == "match":
            raise AttributeError(label)
        return _match_attribute(self.match, label)

    def group(self, label=0):
        return self.match.group(label)


def makepat(object):
    """Return 'object' as a Pattern, treating a string as a literal."""
    if isinstance(object, str):
        return exactly(object)
    return object


# ---------------------------------------------------- backward compatibility

def _regex_of(pattern):
    """Return the regex source for a raw regex string or a Pattern."""
    if isinstance(pattern, str):
        return pattern
    return pattern.regex


def compile(pattern, flags=0):
    return re.compile(_regex_of(pattern), flags)


def icompile(pattern, flags=0):
    return re.compile(_regex_of(pattern), flags | re.IGNORECASE)


def match(pattern, string, start=0):
    found = compile(pattern).match(string, start)
    return found and Match(found)


def imatch(pattern, string, start=0):
    found = icompile(pattern).match(string, start)
    return found and Match(found)


def search(pattern, string, start=0):
    found = compile(pattern).search(string, start)
    return found and Match(found)


def isearch(pattern, string, start=0):
    found = icompile(pattern).search(string, start)
    return found and Match(found)


def sub(pattern, repl, string, count=0):
    return compile(pattern).sub(repl, string, count)


def isub(pattern, repl, string, count=0):
    return icompile(pattern).sub(repl, string, count)


def subn(pattern, repl, string, count=0):
    return compile(pattern).subn(repl, string, count)


def isubn(pattern, repl, string, count=0):
    return icompile(pattern).subn(repl, string, count)


# NOTE(review): unlike Pattern.split(), these two hand 'pieces' to re
# unchanged, i.e. as the maximum number of splits -- confirm that the
# difference is intended before relying on it.
def split(pattern, string, pieces=0):
    return compile(pattern).split(string, pieces)


def isplit(pattern, string, pieces=0):
    return icompile(pattern).split(string, pieces)


# ----------------------------------------------------------------- constants

letter = letters = Pattern("[A-Za-z]")
digit = digits = Pattern("[0-9]")
hexdigit = hexdigits = Pattern("[A-Fa-f0-9]")
wordchar = wordchars = Pattern("[A-Za-z0-9_]")
whitespace = Pattern("[ \t\r\n\f]")

nonletter = nonletters = Pattern("[^A-Za-z]")
nondigit = nondigits = Pattern("[^0-9]")
nonhexdigit = nonhexdigits = Pattern("[^A-Fa-f0-9]")
nonwordchar = nonwordchars = Pattern("[^A-Za-z0-9_]")
nonwhitespace = Pattern("[^ \t\r\n\f]")

begline = Pattern("^")
endline = Pattern("$")

anychar = anychars = Pattern(".")
anything = Pattern(".*")
something = Pattern(".+")
anyspace = Pattern("[ \t\r\n\f]*")
somespace = Pattern("[ \t\r\n\f]+")


# --------------------------------------------------------- character classes

# The Pattern constants accepted by member()/nonmember(), recognised by
# identity, with the text each contributes inside the brackets.
_CLASS_CONSTANTS = (
    (letter, "A-Za-z"),
    (digit, "0-9"),
    (hexdigit, "A-Fa-f0-9"),
    (wordchar, "A-Za-z0-9_"),
    (whitespace, " \t\r\n\f"),
)

# Characters that are (or can be) special inside a bracketed class.  They
# are always written backslash-escaped so their position cannot matter.
_CLASS_SPECIAL_CHARS = "\\]^-["


def charclass(*chars):
    """Build the inside of a bracketed character class.

    Each argument is either one of the Pattern constants 'letter',
    'digit', 'hexdigit', 'wordchar', 'whitespace' or a string of literal
    characters.  Raises 'error' for anything else.
    """
    ordinary = []
    escaped = []
    for arg in chars:
        for constant, text in _CLASS_CONSTANTS:
            if arg is constant:
                ordinary.append(text)
                break
        else:
            if not isinstance(arg, str):
                raise error(
                    "member() and nonmember() only accept string literals")
            for ch in arg:
                if ch not in _CLASS_SPECIAL_CHARS:
                    ordinary.append(ch)
                elif "\\" + ch not in escaped:
                    escaped.append("\\" + ch)
    return "".join(ordinary + escaped)


def member(*chars):
    """Pattern matching any single character mentioned."""
    return Pattern("[" + charclass(*chars) + "]")


def nonmember(*chars):
    """Pattern matching any single character not mentioned."""
    return Pattern("[^" + charclass(*chars) + "]")


members, nonmembers = member, nonmember


def chrange(start, end):
    """Return a string of every character from 'start' to 'end' inclusive
    (the two ends may be given in either order)."""
    minord, maxord = ord(start), ord(end)
    if minord > maxord:
        minord, maxord = maxord, minord
    return "".join([chr(i) for i in range(minord, maxord + 1)])


# ------------------------------------------------------------------ escaping

# Characters that must be backslash-escaped to stand for themselves in an
# 're' pattern outside a character class.
_SPECIAL_CHARS = "^$[]\\+*?.(){}|"


def _escape(literal):
    """Return regex source matching the string 'literal' exactly."""
    return "".join(
        [("\\" + ch) if ch in _SPECIAL_CHARS else ch for ch in literal])


def exactly(literal):
    """Pattern matching exactly the given string."""
    return Pattern(_escape(literal))


exact = exactly


def anybut(literal):
    """Pattern matching the longest run of text not containing 'literal'.

    Each character is consumed only if 'literal' does not start there, so
    the matched text can never contain the string.
    """
    return Pattern("((?!" + _escape(literal) + ")[\\s\\S])*")


# ------------------------------------------------------ repetition operators

# A whole bracketed class, allowing backslash-escaped characters inside.
charclassprog = re.compile(r"^\[\^?\]?([^]\\]|\\.)*\]$")
# A single parenthesised group with no parentheses inside it.
parenprog = re.compile(r"^\([^()]*\)$")


def atomic(expr):
    """Return regex 'expr' in a form a repetition operator can follow.

    Single alphanumerics, single escaped characters, a lone character
    class and a lone simple group are already atomic; anything else is
    wrapped in parentheses.
    """
    if len(expr) == 1 and expr in string.digits + string.ascii_letters:
        return expr
    if len(expr) == 2 and expr[0] == "\\":
        return expr
    if charclassprog.match(expr):
        return expr
    if parenprog.match(expr):
        return expr
    return "(" + expr + ")"


def maybe(expr):
    """Zero or one occurrence of 'expr'."""
    return Pattern(atomic(makepat(expr).regex) + "?")


def some(expr):
    """One or more occurrences of 'expr'."""
    return Pattern(atomic(makepat(expr).regex) + "+")


def any(expr):
    """Zero or more occurrences of 'expr'."""
    return Pattern(atomic(makepat(expr).regex) + "*")


# --------------------------------------------------------------- alternation

def either(*alternatives):
    """One of the given alternatives."""
    options = [makepat(option).regex for option in alternatives]
    return Pattern("(" + "|".join(options) + ")")


# ----------------------------------------------------- symbolic group labels

class Label(object):
    """Callable that labels a subgroup: label(name, expr) or label.name(expr)."""

    def __call__(self, name, expr):
        return Pattern("(?P<" + name + ">" + makepat(expr).regex + ")")

    def __getattr__(self, name):
        # Special names are never group labels; refusing them keeps
        # copy/pickle/introspection probes from getting a bogus callable.
        if name.startswith("__"):
            raise AttributeError(name)
        return lambda expr, self=self, name=name: self.__call__(name, expr)


label = Label()


# ----------------------------------------------------------------- lookahead

def followedby(expr):
    """Positive lookahead assertion for 'expr'."""
    return Pattern("(?=" + makepat(expr).regex + ")")


def notfollowedby(expr):
    """Negative lookahead assertion for 'expr'."""
    return Pattern("(?!" + makepat(expr).regex + ")")


# ---------------------- welcome() and banish() for moving into other modules

exports = ["chrange", "exact", "exactly", "anybut", "member", "members",
           "nonmember", "nonmembers", "maybe", "some", "any", "either",
           "label", "followedby", "notfollowedby",
           "letter", "letters", "nonletter", "nonletters",
           "digit", "digits", "nondigit", "nondigits",
           "hexdigit", "hexdigits", "nonhexdigit", "nonhexdigits",
           "wordchar", "wordchars", "nonwordchar", "nonwordchars",
           "anychar", "anychars", "anything", "something",
           "whitespace", "nonwhitespace", "anyspace", "somespace",
           "begline", "endline"]

# Maps a module name to the dict of names welcome() displaced in it.
displaced = {}


def callermodule():
    """Return the module of the nearest calling frame outside this module."""
    frame = sys._getframe()
    module = None
    while frame:
        module = frame.f_globals.get("__name__")
        if module != __name__:
            break
        frame = frame.f_back
    if not module:
        return module
    return sys.modules.get(module)


def _resolve_target(target, caller):
    """Turn the optional 'target' argument of welcome()/banish() into a
    module object; 'caller' names the function for the error message."""
    if not target:
        target = callermodule()
    elif isinstance(target, str):
        target = sys.modules[target]
    if target is None:
        raise error(caller + "(): cannot determine the calling module")
    return target


def welcome(target=None):
    """Bind the names in 'exports' into 'target', saving what they replace.

    'target' may be a module, a module name, or omitted to mean the
    calling module.  Raises 'error' if already resident there.
    """
    target = _resolve_target(target, "welcome")
    if target.__name__ in displaced:
        raise error("welcome(): already resident in " + repr(target))
    tdict = target.__dict__
    tsave = displaced[target.__name__] = {}
    source = globals()
    for name in exports:
        if name in tdict:
            tsave[name] = tdict[name]
        tdict[name] = source[name]


def banish(target=None):
    """Undo a previous welcome(): restore displaced names, remove the rest.

    Raises 'error' if not currently resident in 'target'.
    """
    target = _resolve_target(target, "banish")
    if target.__name__ not in displaced:
        raise error("banish(): not currently resident in " + repr(target))
    tdict = target.__dict__
    tsave = displaced[target.__name__]
    for name in exports:
        if name in tsave:
            tdict[name] = tsave[name]
        elif name in tdict:
            del tdict[name]
    del displaced[target.__name__]