# # Secret Labs' Regular Expression Engine core module # # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. # # This version of the SRE library can be redistributed under CNRI's # Python 1.6 license. For any other use, please contact Secret Labs # AB (info@pythonware.com). # # Portions of this engine have been developed in cooperation with # CNRI. Hewlett-Packard provided funding for 1.6 integration and # other compatibility work. # # 2010-01-16 mrab Python front-end re-written and extended import string import sys import unicodedata from collections import defaultdict if sys.version_info < (2, 6): from Python25 import _regex elif sys.version_info < (2, 7): from Python26 import _regex else: from Python27 import _regex __all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH", "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error", "Scanner"] # The regex exception. class error(Exception): def __init__(self, message, set_error=False): Exception.__init__(self, message) self.set_error = set_error # The exception for when a positional flag has been turned on in the old # behaviour. class _UnscopedFlagSet(Exception): pass # The exception for when parsing fails and we want to try something else. class ParseError(Exception): pass # The exception for when there isn't a valid first set. class _FirstSetError(Exception): pass # Flags. A = ASCII = 0x80 # Assume ASCII locale. B = BESTMATCH = 0x1000 # Best fuzzy match. D = DEBUG = 0x200 # Print parsed pattern. E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first # fuzzy match. F = FULLCASE = 0x4000 # Unicode full case-folding. I = IGNORECASE = 0x2 # Ignore case. L = LOCALE = 0x4 # Assume current 8-bit locale. M = MULTILINE = 0x8 # Make anchors look for newline. R = REVERSE = 0x400 # Search backwards. S = DOTALL = 0x10 # Make dot match newline. U = UNICODE = 0x20 # Assume Unicode locale. V0 = VERSION0 = 0x2000 # Old legacy behaviour. V1 = VERSION1 = 0x100 # New enhanced behaviour. W = WORD = 0x800 # Default Unicode word breaks. X = VERBOSE = 0x40 # Ignore whitespace and comments. T = TEMPLATE = 0x1 # Template (present because re module has it). DEFAULT_VERSION = VERSION1 _ALL_VERSIONS = VERSION0 | VERSION1 _ALL_ENCODINGS = ASCII | LOCALE | UNICODE # The default flags for the various versions. DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE} # The mask for the flags. GLOBAL_FLAGS = (_ALL_ENCODINGS | _ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | REVERSE) SCOPED_FLAGS = FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE ALPHA = frozenset(string.ascii_letters) DIGITS = frozenset(string.digits) ALNUM = ALPHA | DIGITS OCT_DIGITS = frozenset(string.octdigits) HEX_DIGITS = frozenset(string.hexdigits) SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""]) NAMED_CHAR_PART = ALNUM | frozenset(" -") PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.") SET_OPS = ("||", "~~", "&&", "--") # The width of the code words inside the regex engine. BYTES_PER_CODE = _regex.get_code_size() BITS_PER_CODE = BYTES_PER_CODE * 8 # The repeat count which represents infinity. UNLIMITED = (1 << BITS_PER_CODE) - 1 # The regular expression flags. REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE, "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "r": REVERSE, "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x": VERBOSE} # The case flags. CASE_FLAGS = FULLCASE | IGNORECASE NOCASE = 0 FULLIGNORECASE = FULLCASE | IGNORECASE FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE # The number of digits in hexadecimal escapes. HEX_ESCAPES = {"x": 2, "u": 4, "U": 8} # A singleton which indicates a comment within a pattern. COMMENT = object() FLAGS = object() # The names of the opcodes. OPCODES = """ FAILURE SUCCESS ANY ANY_ALL ANY_ALL_REV ANY_REV ANY_U ANY_U_REV ATOMIC BOUNDARY BRANCH CALL_REF CHARACTER CHARACTER_IGN CHARACTER_IGN_REV CHARACTER_REV DEFAULT_BOUNDARY DEFAULT_END_OF_WORD DEFAULT_START_OF_WORD END END_OF_LINE END_OF_LINE_U END_OF_STRING END_OF_STRING_LINE END_OF_STRING_LINE_U END_OF_WORD FUZZY GRAPHEME_BOUNDARY GREEDY_REPEAT GROUP GROUP_CALL GROUP_EXISTS LAZY_REPEAT LOOKAROUND NEXT PROPERTY PROPERTY_IGN PROPERTY_IGN_REV PROPERTY_REV RANGE RANGE_IGN RANGE_IGN_REV RANGE_REV REF_GROUP REF_GROUP_FLD REF_GROUP_FLD_REV REF_GROUP_IGN REF_GROUP_IGN_REV REF_GROUP_REV SEARCH_ANCHOR SET_DIFF SET_DIFF_IGN SET_DIFF_IGN_REV SET_DIFF_REV SET_INTER SET_INTER_IGN SET_INTER_IGN_REV SET_INTER_REV SET_SYM_DIFF SET_SYM_DIFF_IGN SET_SYM_DIFF_IGN_REV SET_SYM_DIFF_REV SET_UNION SET_UNION_IGN SET_UNION_IGN_REV SET_UNION_REV START_OF_LINE START_OF_LINE_U START_OF_STRING START_OF_WORD STRING STRING_FLD STRING_FLD_REV STRING_IGN STRING_IGN_REV STRING_REV STRING_SET STRING_SET_FLD STRING_SET_FLD_REV STRING_SET_IGN STRING_SET_IGN_REV STRING_SET_REV """ # Define the opcodes in a namespace. class Namespace(object): pass OP = Namespace() for i, op in enumerate(OPCODES.split()): setattr(OP, op, i) def _shrink_cache(cache_dict, args_dict, max_length, divisor=5): """Make room in the given cache. Args: cache_dict: The cache dictionary to modify. args_dict: The dictionary of named list args used by patterns. max_length: Maximum # of entries in cache_dict before it is shrunk. divisor: Cache will shrink to max_length - 1/divisor*max_length items. """ # Toss out a fraction of the entries at random to make room for new ones. # A random algorithm was chosen as opposed to simply cache_dict.popitem() # as popitem could penalize the same regular expression repeatedly based # on its internal hash value. Being random should spread the cache miss # love around. cache_keys = tuple(cache_dict.keys()) overage = len(cache_keys) - max_length if overage < 0: # Cache is already within limits. Normally this should not happen # but it could due to multithreading. return number_to_toss = max_length // divisor + overage # The import is done here to avoid a circular dependency. import random if not hasattr(random, 'sample'): # Do nothing while resolving the circular dependency: # re->random->warnings->tokenize->string->re return for doomed_key in random.sample(cache_keys, number_to_toss): try: del cache_dict[doomed_key] except KeyError: # Ignore problems if the cache changed from another thread. pass # Rebuild the arguments dictionary. args_dict.clear() for pattern, pattern_type, flags, args, default_version in cache_dict: args_dict[pattern, pattern_type, flags, default_version] = args def _fold_case(info, string): "Folds the case of a string." flags = info.flags if (flags & _ALL_ENCODINGS) == 0: flags |= info.guess_encoding return _regex.fold_case(flags, string) def is_cased(info, char): "Checks whether a character is cased." return len(_regex.get_all_cases(info.flags, char)) > 1 def _compile_firstset(info, fs): "Compiles the firstset for the pattern." if not fs or None in fs: return [] # If we ignore the case, for simplicity we won't build a firstset. members = set() for i in fs: if i.case_flags: if isinstance(i, Character): if is_cased(info, i.value): return [] elif isinstance(i, SetBase): return [] members.add(i.with_flags(case_flags=NOCASE)) # Build the firstset. fs = SetUnion(info, list(members), zerowidth=True) fs = fs.optimise(info, in_set=True) # Compile the firstset. return fs.compile(bool(info.flags & REVERSE)) def _flatten_code(code): "Flattens the code from a list of tuples." flat_code = [] for c in code: flat_code.extend(c) return flat_code def make_character(info, value, in_set=False): "Makes a character literal." if in_set: # A character set is built case-sensitively. return Character(value) return Character(value, case_flags=info.flags & CASE_FLAGS) def make_ref_group(info, name, position): "Makes a group reference." return RefGroup(info, name, position, case_flags=info.flags & CASE_FLAGS) def make_string_set(info, name): "Makes a string set." return StringSet(info, name, case_flags=info.flags & CASE_FLAGS) def make_property(info, prop, in_set): "Makes a property." if in_set: return prop return prop.with_flags(case_flags=info.flags & CASE_FLAGS) def _parse_pattern(source, info): "Parses a pattern, eg. 'a|b|c'." branches = [parse_sequence(source, info)] while source.match("|"): branches.append(parse_sequence(source, info)) if len(branches) == 1: return branches[0] return Branch(branches) def parse_sequence(source, info): "Parses a sequence, eg. 'abc'." sequence = [] applied = False while True: # Get literal characters followed by an element. characters, case_flags, element = parse_literal_and_element(source, info) if not element: # No element, just a literal. We've also reached the end of the # sequence. append_literal(characters, case_flags, sequence) break if element is COMMENT or element is FLAGS: append_literal(characters, case_flags, sequence) elif type(element) is tuple: # It looks like we've found a quantifier. ch, saved_pos = element counts = parse_quantifier(source, info, ch) if counts: # It _is_ a quantifier. apply_quantifier(source, info, counts, characters, case_flags, ch, saved_pos, applied, sequence) applied = True else: # It's not a quantifier. Maybe it's a fuzzy constraint. constraints = parse_fuzzy(source, ch) if constraints: # It _is_ a fuzzy constraint. apply_constraint(source, info, constraints, characters, case_flags, saved_pos, applied, sequence) applied = True else: # The element was just a literal. characters.append(ord(ch)) append_literal(characters, case_flags, sequence) applied = False else: # We have a literal followed by something else. append_literal(characters, case_flags, sequence) sequence.append(element) applied = False return make_sequence(sequence) def apply_quantifier(source, info, counts, characters, case_flags, ch, saved_pos, applied, sequence): if characters: # The quantifier applies to the last character. append_literal(characters[ : -1], case_flags, sequence) element = Character(characters[-1], case_flags=case_flags) else: # The quantifier applies to the last item in the sequence. if applied or not sequence: raise error("nothing to repeat at position %d" % saved_pos) element = sequence.pop() min_count, max_count = counts saved_pos = source.pos ch = source.get() if ch == "?": # The "?" suffix that means it's a lazy repeat. repeated = LazyRepeat elif ch == "+": # The "+" suffix that means it's a possessive repeat. repeated = PossessiveRepeat else: # No suffix means that it's a greedy repeat. source.pos = saved_pos repeated = GreedyRepeat # Ignore the quantifier if it applies to a zero-width item or the number of # repeats is fixed at 1. if not element.is_empty() and (min_count != 1 or max_count != 1): element = repeated(element, min_count, max_count) sequence.append(element) def apply_constraint(source, info, constraints, characters, case_flags, saved_pos, applied, sequence): if characters: # The constraint applies to the last character. append_literal(characters[ : -1], case_flags, sequence) element = Character(characters[-1], case_flags=case_flags) sequence.append(Fuzzy(element, constraints)) else: # The constraint applies to the last item in the sequence. if applied or not sequence: raise error("nothing for fuzzy constraint at position %d" % saved_pos) element = sequence.pop() # If a group is marked as fuzzy then put all of the fuzzy part in the # group. if isinstance(element, Group): element.subpattern = Fuzzy(element.subpattern, constraints) sequence.append(element) else: sequence.append(Fuzzy(element, constraints)) def append_literal(characters, case_flags, sequence): if characters: sequence.append(Literal(characters, case_flags=case_flags)) def PossessiveRepeat(element, min_count, max_count): "Builds a possessive repeat." return Atomic(GreedyRepeat(element, min_count, max_count)) _QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)} def parse_quantifier(source, info, ch): "Parses a quantifier." q = _QUANTIFIERS.get(ch) if q: # It's a quantifier. return q if ch == "{": # Looks like a limited repeated element, eg. 'a{2,3}'. counts = parse_limited_quantifier(source) if counts: return counts return None def is_above_limit(count): "Checks whether a count is above the maximum." return count is not None and count >= UNLIMITED def parse_limited_quantifier(source): "Parses a limited quantifier." saved_pos = source.pos min_count = parse_count(source) if source.match(","): max_count = parse_count(source) # No minimum means 0 and no maximum means unlimited. min_count = int(min_count or 0) max_count = int(max_count) if max_count else None if max_count is not None and min_count > max_count: raise error("min repeat greater than max repeat at position %d" % saved_pos) else: if not min_count: source.pos = saved_pos return None min_count = max_count = int(min_count) if is_above_limit(min_count) or is_above_limit(max_count): raise error("repeat count too big at position %d" % saved_pos) if not source.match ("}"): source.pos = saved_pos return None return min_count, max_count def parse_fuzzy(source, ch): "Parses a fuzzy setting, if present." if ch != "{": return None saved_pos = source.pos constraints = {} try: parse_fuzzy_item(source, constraints) while source.match(","): parse_fuzzy_item(source, constraints) except ParseError: source.pos = saved_pos return None if not source.match("}"): raise error("expected } at position %d" % source.pos) return constraints def parse_fuzzy_item(source, constraints): "Parses a fuzzy setting item." saved_pos = source.pos try: parse_cost_constraint(source, constraints) except ParseError: source.pos = saved_pos parse_cost_equation(source, constraints) def parse_cost_constraint(source, constraints): "Parses a cost constraint." saved_pos = source.pos ch = source.get() if ch in ALPHA: # Syntax: constraint [("<=" | "<") cost] constraint = parse_constraint(source, constraints, ch) max_inc = parse_fuzzy_compare(source) if max_inc is None: # No maximum cost. constraints[constraint] = 0, None else: # There's a maximum cost. cost_pos = source.pos max_cost = int(parse_count(source)) # Inclusive or exclusive limit? if not max_inc: max_cost -= 1 if max_cost < 0: raise error("bad fuzzy cost limit at position %d" % cost_pos) constraints[constraint] = 0, max_cost elif ch in DIGITS: # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost source.pos = saved_pos try: # Minimum cost. min_cost = int(parse_count(source)) min_inc = parse_fuzzy_compare(source) if min_inc is None: raise ParseError() constraint = parse_constraint(source, constraints, source.get()) max_inc = parse_fuzzy_compare(source) if max_inc is None: raise ParseError() # Maximum cost. cost_pos = source.pos max_cost = int(parse_count(source)) # Inclusive or exclusive limits? if not min_inc: min_cost += 1 if not max_inc: max_cost -= 1 if not 0 <= min_cost <= max_cost: raise error("bad fuzzy cost limit at position %d" % cost_pos) constraints[constraint] = min_cost, max_cost except ValueError: raise ParseError() else: raise ParseError() def parse_constraint(source, constraints, ch): "Parses a constraint." if ch not in "deis": raise error("bad fuzzy constraint at position %d" % source.pos) if ch in constraints: raise error("repeated fuzzy constraint at position %d" % source.pos) return ch def parse_fuzzy_compare(source): "Parses a cost comparator." if source.match("<="): return True elif source.match("<"): return False else: return None def parse_cost_equation(source, constraints): "Parses a cost equation." if "cost" in constraints: raise error("more than one cost equation at position %d" % source.pos) cost = {} parse_cost_term(source, cost) while source.match("+"): parse_cost_term(source, cost) max_inc = parse_fuzzy_compare(source) if max_inc is None: raise error("missing fuzzy cost limit at position %d" % source.pos) max_cost = int(parse_count(source)) if not max_inc: max_cost -= 1 if max_cost < 0: raise error("bad fuzzy cost limit at position %d" % source.pos) cost["max"] = max_cost constraints["cost"] = cost def parse_cost_term(source, cost): "Parses a cost equation term." coeff = parse_count(source) ch = source.get() if ch not in "dis": raise ParseError() if ch in cost: raise error("repeated fuzzy cost at position %d" % source.pos) cost[ch] = int(coeff or 1) def parse_count(source): "Parses a quantifier's count, which can be empty." return source.get_while(DIGITS) def parse_literal_and_element(source, info): """Parses a literal followed by an element. The element is FLAGS if it's an inline flag or None if it has reached the end of a sequence. """ characters = [] case_flags = info.flags & CASE_FLAGS while True: saved_pos = source.pos ch = source.get() if ch in SPECIAL_CHARS: if ch in ")|": # The end of a sequence. At the end of the pattern ch is "". source.pos = saved_pos return characters, case_flags, None elif ch == "\\": # An escape sequence outside a set. element = parse_escape(source, info, False) return characters, case_flags, element elif ch == "(": # A parenthesised subpattern or a flag. element = parse_paren(source, info) if element and element is not COMMENT: return characters, case_flags, element elif ch == ".": # Any character. if info.flags & DOTALL: element = AnyAll() elif info.flags & WORD: element = AnyU() else: element = Any() return characters, case_flags, element elif ch == "[": # A character set. element = parse_set(source, info) return characters, case_flags, element elif ch == "^": # The start of a line or the string. if info.flags & MULTILINE: if info.flags & WORD: element = StartOfLineU() else: element = StartOfLine() else: element = StartOfString() return characters, case_flags, element elif ch == "$": # The end of a line or the string. if info.flags & MULTILINE: if info.flags & WORD: element = EndOfLineU() else: element = EndOfLine() else: if info.flags & WORD: element = EndOfStringLineU() else: element = EndOfStringLine() return characters, case_flags, element elif ch in "?*+{": # Looks like a quantifier. return characters, case_flags, (ch, saved_pos) else: # A literal. characters.append(ord(ch)) else: # A literal. characters.append(ord(ch)) def parse_paren(source, info): """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an inline flag. """ saved_pos = source.pos ch = source.get() if ch == "?": # (?... saved_pos_2 = source.pos ch = source.get() if ch == "<": # (?<... saved_pos_3 = source.pos ch = source.get() if ch in ("=", "!"): # (?<=... or (?") saved_flags = info.flags try: subpattern = _parse_pattern(source, info) source.expect(")") finally: info.flags = saved_flags source.ignore_space = bool(info.flags & VERBOSE) info.close_group() return Group(info, group, subpattern) if ch in ("=", "!"): # (?=... or (?!...: lookahead. return parse_lookaround(source, info, False, ch == "=") if ch == "P": # (?P...: a Python extension. return parse_extension(source, info) if ch == "#": # (?#...: a comment. return parse_comment(source) if ch == "(": # (?(...: a conditional subpattern. return parse_conditional(source, info) if ch == ">": # (?>...: an atomic subpattern. return parse_atomic(source, info) if ch == "|": # (?|...: a common/reset groups branch. return parse_common(source, info) if ch == "R" or "0" <= ch <= "9": # (?R...: probably a call to a group. return parse_call_group(source, info, ch, saved_pos_2) if ch == "&": # (?&...: a call to a named group. return parse_call_named_group(source, info, saved_pos_2) # (?...: probably a flags subpattern. source.pos = saved_pos_2 return parse_flags_subpattern(source, info) # (...: an unnamed capture group. source.pos = saved_pos group = info.open_group() saved_flags = info.flags try: subpattern = _parse_pattern(source, info) source.expect(")") finally: info.flags = saved_flags source.ignore_space = bool(info.flags & VERBOSE) info.close_group() return Group(info, group, subpattern) def parse_extension(source, info): "Parses a Python extension." saved_pos = source.pos ch = source.get() if ch == "<": # (?P<...: a named capture group. name = parse_name(source) group = info.open_group(name) source.expect(">") saved_flags = info.flags try: subpattern = _parse_pattern(source, info) source.expect(")") finally: info.flags = saved_flags source.ignore_space = bool(info.flags & VERBOSE) info.close_group() return Group(info, group, subpattern) if ch == "=": # (?P=...: a named group reference. name = parse_name(source) source.expect(")") if info.is_open_group(name): raise error("can't refer to an open group at position %d" % saved_pos) return make_ref_group(info, name, saved_pos) if ch == ">" or ch == "&": # (?P>...: a call to a group. return parse_call_named_group(source, info, saved_pos) source.pos = saved_pos raise error("unknown extension at position %d" % saved_pos) def parse_comment(source): "Parses a comment." source.skip_while(set(")"), include=False) source.expect(")") return COMMENT def parse_lookaround(source, info, behind, positive): "Parses a lookaround." saved_flags = info.flags try: subpattern = _parse_pattern(source, info) source.expect(")") finally: info.flags = saved_flags source.ignore_space = bool(info.flags & VERBOSE) return LookAround(behind, positive, subpattern) def parse_conditional(source, info): "Parses a conditional subpattern." saved_flags = info.flags saved_pos = source.pos try: group = parse_name(source, True) source.expect(")") yes_branch = parse_sequence(source, info) if source.match("|"): no_branch = parse_sequence(source, info) else: no_branch = Sequence() source.expect(")") finally: info.flags = saved_flags source.ignore_space = bool(info.flags & VERBOSE) if yes_branch.is_empty() and no_branch.is_empty(): return Sequence() return Conditional(info, group, yes_branch, no_branch, saved_pos) def parse_atomic(source, info): "Parses an atomic subpattern." saved_flags = info.flags try: subpattern = _parse_pattern(source, info) source.expect(")") finally: info.flags = saved_flags source.ignore_space = bool(info.flags & VERBOSE) return Atomic(subpattern) def parse_common(source, info): "Parses a common groups branch." # Capture group numbers in different branches can reuse the group numbers. initial_group_count = info.group_count branches = [parse_sequence(source, info)] final_group_count = info.group_count while source.match("|"): info.group_count = initial_group_count branches.append(parse_sequence(source, info)) final_group_count = max(final_group_count, info.group_count) info.group_count = final_group_count source.expect(")") if len(branches) == 1: return branches[0] return Branch(branches) def parse_call_group(source, info, ch, pos): "Parses a call to a group." if ch == "R": group = "0" else: group = ch + source.get_while(DIGITS) source.expect(")") return CallGroup(info, group, pos) def parse_call_named_group(source, info, pos): "Parses a call to a named group." group = parse_name(source) source.expect(")") return CallGroup(info, group, pos) def parse_flag_set(source): "Parses a set of inline flags." flags = 0 try: while True: saved_pos = source.pos ch = source.get() if ch == "V": ch += source.get() flags |= REGEX_FLAGS[ch] except KeyError: source.pos = saved_pos return flags def parse_flags(source, info): "Parses flags being turned on/off." flags_on = parse_flag_set(source) if source.match("-"): flags_off = parse_flag_set(source) if not flags_off: raise error("bad inline flags: no flags after '-' at position %d" % source.pos) else: flags_off = 0 return flags_on, flags_off def parse_subpattern(source, info, flags_on, flags_off): "Parses a subpattern with scoped flags." saved_flags = info.flags info.flags = (info.flags | flags_on) & ~flags_off source.ignore_space = bool(info.flags & VERBOSE) try: subpattern = _parse_pattern(source, info) source.expect(")") finally: info.flags = saved_flags source.ignore_space = bool(info.flags & VERBOSE) return subpattern def parse_flags_subpattern(source, info): """Parses a flags subpattern. It could be inline flags or a subpattern possibly with local flags. If it's a subpattern, then that's returned; if it's a inline flags, then FLAGS is returned. """ flags_on, flags_off = parse_flags(source, info) if flags_off & GLOBAL_FLAGS: raise error("bad inline flags: can't turn off global flag at position %d" % source.pos) if flags_on & flags_off: raise error("bad inline flags: flag turned on and off at position %d" % source.pos) # Handle flags which are global in all regex behaviours. new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS if new_global_flags: info.global_flags |= new_global_flags # A global has been turned on, so reparse the pattern. raise _UnscopedFlagSet(info.global_flags) # Ensure that from now on we have only scoped flags. flags_on &= ~GLOBAL_FLAGS if source.match(":"): return parse_subpattern(source, info, flags_on, flags_off) if source.match(")"): parse_positional_flags(source, info, flags_on, flags_off) return FLAGS raise error("unknown extension at position %d" % source.pos) def parse_positional_flags(source, info, flags_on, flags_off): "Parses positional flags." version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION if version == VERSION0: # Positional flags are global and can only be turned on. if flags_off: raise error("bad inline flags: can't turn flags off at position %d" % source.pos) new_global_flags = flags_on & ~info.global_flags if new_global_flags: info.global_flags |= new_global_flags # A global has been turned on, so reparse the pattern. raise _UnscopedFlagSet(info.global_flags) else: info.flags = (info.flags | flags_on) & ~flags_off source.ignore_space = bool(info.flags & VERBOSE) def parse_name(source, allow_numeric=False): "Parses a name." name = source.get_while(set(")>"), include=False) if not name: raise error("bad group name at position %d" % source.pos) if name.isdigit(): if not allow_numeric: raise error("bad group name at position %d" % source.pos) else: if not is_identifier(name): raise error("bad group name at position %d" % source.pos) return name def is_identifier(name): if not name: return False if name[0] not in ALPHA and name[0] != "_": return False name = name.replace("_", "") return not name or all(c in ALNUM for c in name) def is_octal(string): "Checks whether a string is octal." return all(ch in OCT_DIGITS for ch in string) def is_decimal(string): "Checks whether a string is decimal." return all(ch in DIGITS for ch in string) def is_hexadecimal(string): "Checks whether a string is hexadecimal." return all(ch in HEX_DIGITS for ch in string) def parse_escape(source, info, in_set): "Parses an escape sequence." saved_ignore = source.ignore_space source.ignore_space = False ch = source.get() source.ignore_space = saved_ignore if not ch: # A backslash at the end of the pattern. raise error("bad escape at position %d" % source.pos) if ch in HEX_ESCAPES: # A hexadecimal escape sequence. return parse_hex_escape(source, info, HEX_ESCAPES[ch], in_set) elif ch == "g" and not in_set: # A group reference. saved_pos = source.pos try: return parse_group_ref(source, info) except error: # Invalid as a group reference, so assume it's a literal. source.pos = saved_pos return make_character(info, ord(ch), in_set) elif ch == "G" and not in_set: # A search anchor. return SearchAnchor() elif ch == "L" and not in_set: # A string set. return parse_string_set(source, info) elif ch == "N": # A named codepoint. return parse_named_char(source, info, in_set) elif ch in "pP": # A Unicode property, positive or negative. return parse_property(source, info, ch == "p", in_set) elif ch == "X" and not in_set: # A grapheme cluster. return Grapheme() elif ch in ALPHA: # An alphabetic escape sequence. # Positional escapes aren't allowed inside a character set. if not in_set: if info.flags & WORD: value = WORD_POSITION_ESCAPES.get(ch) else: value = POSITION_ESCAPES.get(ch) if value: return value value = CHARSET_ESCAPES.get(ch) if value: return value value = CHARACTER_ESCAPES.get(ch) if value: return Character(ord(value)) return make_character(info, ord(ch), in_set) elif ch in DIGITS: # A numeric escape sequence. return parse_numeric_escape(source, info, ch, in_set) else: # A literal. return make_character(info, ord(ch), in_set) def parse_numeric_escape(source, info, ch, in_set): "Parses a numeric escape sequence." if in_set or ch == "0": # Octal escape sequence, max 3 digits. return parse_octal_escape(source, info, [ch], in_set) # At least 1 digit, so either octal escape or group. digits = ch saved_pos = source.pos ch = source.get() if ch in DIGITS: # At least 2 digits, so either octal escape or group. digits += ch saved_pos = source.pos ch = source.get() if is_octal(digits) and ch in OCT_DIGITS: # 3 octal digits, so octal escape sequence. encoding = info.flags & _ALL_ENCODINGS if encoding == ASCII or encoding == LOCALE: octal_mask = 0xFF else: octal_mask = 0x1FF value = int(digits + ch, 8) & octal_mask return make_character(info, value) # Group reference. source.pos = saved_pos if info.is_open_group(digits): raise error("can't refer to an open group at position %d" % source.pos) return make_ref_group(info, digits, source.pos) def parse_octal_escape(source, info, digits, in_set): "Parses an octal escape sequence." saved_pos = source.pos ch = source.get() while len(digits) < 3 and ch in OCT_DIGITS: digits.append(ch) saved_pos = source.pos ch = source.get() source.pos = saved_pos try: value = int("".join(digits), 8) return make_character(info, value, in_set) except ValueError: raise error("bad octal escape at position %d" % source.pos) def parse_hex_escape(source, info, expected_len, in_set): "Parses a hex escape sequence." digits = [] for i in range(expected_len): ch = source.get() if ch not in HEX_DIGITS: raise error("bad hex escape at position %d" % source.pos) digits.append(ch) value = int("".join(digits), 16) return make_character(info, value, in_set) def parse_group_ref(source, info): "Parses a group reference." source.expect("<") saved_pos = source.pos name = parse_name(source, True) source.expect(">") if info.is_open_group(name): raise error("can't refer to an open group at position %d" % source.pos) return make_ref_group(info, name, saved_pos) def parse_string_set(source, info): "Parses a string set reference." source.expect("<") name = parse_name(source, True) source.expect(">") if name is None or name not in info.kwargs: raise error("undefined named list at position %d" % source.pos) return make_string_set(info, name) def parse_named_char(source, info, in_set): "Parses a named character." saved_pos = source.pos if source.match("{"): name = source.get_while(NAMED_CHAR_PART) if source.match("}"): try: value = unicodedata.lookup(name) return make_character(info, ord(value), in_set) except KeyError: raise error("undefined character name at position %d" % source.pos) source.pos = saved_pos return make_character(info, ord("N"), in_set) def parse_property(source, info, positive, in_set): "Parses a Unicode property." saved_pos = source.pos ch = source.get() if ch == "{": negate = source.match("^") prop_name, name = parse_property_name(source) if source.match("}"): # It's correctly delimited. prop = lookup_property(prop_name, name, positive != negate, source_pos=source.pos) return make_property(info, prop, in_set) elif ch and ch in "CLMNPSZ": # An abbreviated property, eg \pL. prop = lookup_property(None, ch, positive) return make_property(info, prop, in_set, source_pos=source.pos) # Not a property, so treat as a literal "p" or "P". source.pos = saved_pos ch = "p" if positive else "P" return make_character(info, ord(ch), in_set) def parse_property_name(source): "Parses a property name, which may be qualified." name = source.get_while(PROPERTY_NAME_PART) saved_pos = source.pos ch = source.get() if ch and ch in ":=": prop_name = name name = source.get_while(ALNUM | set(" &_-./")).strip() if name: # Name after the ":" or "=", so it's a qualified name. saved_pos = source.pos else: # No name after the ":" or "=", so assume it's an unqualified name. prop_name, name = None, prop_name else: prop_name = None source.pos = saved_pos return prop_name, name def parse_set(source, info): "Parses a character set." version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION saved_ignore = source.ignore_space source.ignore_space = False # Negative set? negate = source.match("^") try: if version == VERSION0: item = parse_set_imp_union(source, info) else: item = parse_set_union(source, info) if not source.match("]"): raise error("missing ] at position %d" % source.pos) finally: source.ignore_space = saved_ignore if negate: item = item.with_flags(positive=not item.positive) item = item.with_flags(case_flags=info.flags & CASE_FLAGS) return item def parse_set_union(source, info): "Parses a set union ([x||y])." items = [parse_set_symm_diff(source, info)] while source.match("||"): items.append(parse_set_symm_diff(source, info)) if len(items) == 1: return items[0] return SetUnion(info, items) def parse_set_symm_diff(source, info): "Parses a set symmetric difference ([x~~y])." items = [parse_set_inter(source, info)] while source.match("~~"): items.append(parse_set_inter(source, info)) if len(items) == 1: return items[0] return SetSymDiff(info, items) def parse_set_inter(source, info): "Parses a set intersection ([x&&y])." items = [parse_set_diff(source, info)] while source.match("&&"): items.append(parse_set_diff(source, info)) if len(items) == 1: return items[0] return SetInter(info, items) def parse_set_diff(source, info): "Parses a set difference ([x--y])." items = [parse_set_imp_union(source, info)] while source.match("--"): items.append(parse_set_imp_union(source, info)) if len(items) == 1: return items[0] return SetDiff(info, items) def parse_set_imp_union(source, info): "Parses a set implicit union ([xy])." version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION items = [parse_set_member(source, info)] while True: saved_pos = source.pos if source.match("]"): # End of the set. source.pos = saved_pos break if version == VERSION1 and any(source.match(op) for op in SET_OPS): # The new behaviour has set operators. source.pos = saved_pos break items.append(parse_set_member(source, info)) if len(items) == 1: return items[0] return SetUnion(info, items) def parse_set_member(source, info): "Parses a member in a character set." # Parse a set item. start = parse_set_item(source, info) if (not isinstance(start, Character) or not start.positive or not source.match("-")): # It's not the start of a range. return start # It looks like the start of a range of characters. saved_pos = source.pos if source.match("]"): # We've reached the end of the set, so return both the character and # hyphen. source.pos = saved_pos return SetUnion(info, [start, Character(ord("-"))]) # Parse a set item. end = parse_set_item(source, info) if not isinstance(end, Character) or not end.positive: # It's not a range, so return the character, hyphen and property. return SetUnion(info, [start, Character(ord("-")), end]) # It _is_ a range. if start.value > end.value: raise error("bad character range at position %d" % source.pos) if start.value == end.value: return start return Range(start.value, end.value) def parse_set_item(source, info): "Parses an item in a character set." version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION if source.match("\\"): # An escape sequence in a set. return parse_escape(source, info, True) saved_pos = source.pos if source.match("[:"): # Looks like a POSIX character class. try: return parse_posix_class(source, info) except ParseError: # Not a POSIX character class. source.pos = saved_pos if version == VERSION1 and source.match("["): # It's the start of a nested set. # Negative set? negate = source.match("^") item = parse_set_union(source, info) if not source.match("]"): raise error("missing ] at position %d" % source.pos) if negate: item = item.with_flags(positive=not item.positive) return item ch = source.get() if not ch: raise error("bad set at position %d" % source.pos, True) return Character(ord(ch)) def parse_posix_class(source, info): "Parses a POSIX character class." negate = source.match("^") prop_name, name = parse_property_name(source) if not source.match(":]"): raise ParseError() return lookup_property(prop_name, name, positive=not negate, source_pos=source.pos) def float_to_rational(flt): "Converts a float to a rational pair." int_part = int(flt) error = flt - int_part if abs(error) < 0.0001: return int_part, 1 den, num = float_to_rational(1.0 / error) return int_part * den + num, den def numeric_to_rational(numeric): "Converts a numeric string to a rational string, if possible." if numeric[0] == "-": sign, numeric = numeric[0], numeric[1 : ] else: sign = "" parts = numeric.split("/") if len(parts) == 2: num, den = float_to_rational(float(parts[0]) / float(parts[1])) elif len(parts) == 1: num, den = float_to_rational(float(parts[0])) else: raise ValueError() result = "%s%s/%s" % (sign, num, den) if result.endswith("/1"): return result[ : -2] return result def standardise_name(name): "Standardises a property or value name." try: return numeric_to_rational("".join(name)) except (ValueError, ZeroDivisionError): return "".join(ch for ch in name if ch not in "_- ").upper() def lookup_property(property, value, positive, source_pos=None): "Looks up a property." # Normalise the names (which may still be lists). property = standardise_name(property) if property else None value = standardise_name(value) if (property, value) == ("GENERALCATEGORY", "ASSIGNED"): property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive if property: # Both the property and the value are provided. prop = PROPERTIES.get(property) if not prop: raise error("unknown property at position %d" % source_pos) prop_id, value_dict = prop val_id = value_dict.get(value) if val_id is None: raise error("unknown property value at position %d" % source_pos) if "YES" in value_dict and val_id == 0: positive, val_id = not positive, 1 return Property((prop_id << 16) | val_id, positive) # Only the value is provided. # It might be the name of a GC, script or block value. for property in ("GC", "SCRIPT", "BLOCK"): prop_id, value_dict = PROPERTIES.get(property) val_id = value_dict.get(value) if val_id is not None: return Property((prop_id << 16) | val_id, positive) # It might be the name of a binary property. prop = PROPERTIES.get(value) if prop: prop_id, value_dict = prop if "YES" in value_dict: return Property((prop_id << 16) | 1, positive) # It might be the name of a binary property starting with a prefix. if value.startswith("IS"): prop = PROPERTIES.get(value[2 : ]) if prop: prop_id, value_dict = prop if "YES" in value_dict: return Property((prop_id << 16) | 1, positive) # It might be the name of a script or block starting with a prefix. for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")): if value.startswith(prefix): prop_id, value_dict = PROPERTIES.get(property) val_id = value_dict.get(value[2 : ]) if val_id is not None: return Property((prop_id << 16) | val_id, positive) # Unknown property. raise error("unknown property at position %d" % source_pos) def _compile_replacement(source, pattern, is_unicode): "Compiles a replacement template escape sequence." ch = source.get() if ch in ALPHA: # An alphabetic escape sequence. value = CHARACTER_ESCAPES.get(ch) if value: return False, [ord(value)] if ch in HEX_ESCAPES and (ch == "x" or is_unicode): # A hexadecimal escape sequence. return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch])] if ch == "g": # A group preference. return True, [compile_repl_group(source, pattern)] if ch == "N" and is_unicode: # A named character. value = parse_repl_named_char(source) if value is not None: return False, [value] return False, [ord("\\"), ord(ch)] if isinstance(source.sep, str): octal_mask = 0xFF else: octal_mask = 0x1FF if ch == "0": # An octal escape sequence. digits = ch while len(digits) < 3: saved_pos = source.pos ch = source.get() if ch not in OCT_DIGITS: source.pos = saved_pos break digits += ch return False, [int(digits, 8) & octal_mask] if ch in DIGITS: # Either an octal escape sequence (3 digits) or a group reference (max # 2 digits). digits = ch saved_pos = source.pos ch = source.get() if ch in DIGITS: digits += ch saved_pos = source.pos ch = source.get() if ch and is_octal(digits + ch): # An octal escape sequence. return False, [int(digits + ch, 8) & octal_mask] # A group reference. source.pos = saved_pos return True, [int(digits)] if ch == "\\": # An escaped backslash is a backslash. return False, [ord("\\")] if not ch: # A trailing backslash. raise error("bad escape at position %d" % source.pos) # An escaped non-backslash is a backslash followed by the literal. return False, [ord("\\"), ord(ch)] def parse_repl_hex_escape(source, expected_len): "Parses a hex escape sequence in a replacement string." digits = [] for i in range(expected_len): ch = source.get() if ch not in HEX_DIGITS: raise error("bad hex escape at position %d" % source.pos) digits.append(ch) return int("".join(digits), 16) def parse_repl_named_char(source): "Parses a named character in a replacement string." saved_pos = source.pos if source.match("{"): name = source.get_while(ALPHA | set(" ")) if source.match("}"): try: value = unicodedata.lookup(name) return ord(value) except KeyError: raise error("undefined character name at position %d" % source.pos) source.pos = saved_pos return None def compile_repl_group(source, pattern): "Compiles a replacement template group reference." source.expect("<") name = parse_name(source, True) source.expect(">") if name.isdigit(): index = int(name) if not 0 <= index <= pattern.groups: raise error("invalid group at position %d" % source.pos) return index try: return pattern.groupindex[name] except KeyError: raise IndexError("unknown group") # The regular expression is parsed into a syntax tree. The different types of # node are defined below. INDENT = " " POSITIVE_OP = 0x1 ZEROWIDTH_OP = 0x2 FUZZY_OP = 0x4 REVERSE_OP = 0x8 REQUIRED_OP = 0x10 POS_TEXT = {False: "NON-MATCH", True: "MATCH"} CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "", FULLIGNORECASE: " FULL_IGNORE_CASE"} def make_sequence(items): if len(items) == 1: return items[0] return Sequence(items) # Common base class for all nodes. class RegexBase(object): def __init__(self): self._key = self.__class__ def with_flags(self, positive=None, case_flags=None, zerowidth=None): if positive is None: positive = self.positive else: positive = bool(positive) if case_flags is None: case_flags = self.case_flags else: case_flags = case_flags & CASE_FLAGS if zerowidth is None: zerowidth = self.zerowidth else: zerowidth = bool(zerowidth) if (positive == self.positive and case_flags == self.case_flags and zerowidth == self.zerowidth): return self return self.rebuild(positive, case_flags, zerowidth) def fix_groups(self, reverse, fuzzy): pass def optimise(self, info): return self def pack_characters(self, info): return self def remove_captures(self): return self def is_atomic(self): return True def can_be_affix(self): return True def contains_group(self): return False def get_firstset(self, reverse): raise _FirstSetError() def has_simple_start(self): return False def compile(self, reverse=False, fuzzy=False): return self._compile(reverse, fuzzy) def dump(self, indent, reverse): self._dump(indent, reverse) def is_empty(self): return False def __hash__(self): return hash(self._key) def __eq__(self, other): return type(self) is type(other) and self._key == other._key def __ne__(self, other): return not self.__eq__(other) def get_required_string(self, reverse): return self.max_width(), None # Base class for zero-width nodes. class ZeroWidthBase(RegexBase): def __init__(self, positive=True): RegexBase.__init__(self) self.positive = bool(positive) self._key = self.__class__, self.positive def get_firstset(self, reverse): return set([None]) def _compile(self, reverse, fuzzy): flags = 0 if self.positive: flags |= POSITIVE_OP if fuzzy: flags |= FUZZY_OP if reverse: flags |= REVERSE_OP return [(self._opcode, flags)] def _dump(self, indent, reverse): print "%s%s %s" % (INDENT * indent, self._op_name, POS_TEXT[self.positive]) def max_width(self): return 0 class Any(RegexBase): _opcode = {False: OP.ANY, True: OP.ANY_REV} _op_name = "ANY" def has_simple_start(self): return True def _compile(self, reverse, fuzzy): flags = 0 if fuzzy: flags |= FUZZY_OP return [(self._opcode[reverse], flags)] def _dump(self, indent, reverse): print "%s%s" % (INDENT * indent, self._op_name) def max_width(self): return 1 class AnyAll(Any): _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV} _op_name = "ANY_ALL" class AnyU(Any): _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV} _op_name = "ANY_U" class Atomic(RegexBase): def __init__(self, subpattern): RegexBase.__init__(self) self.subpattern = subpattern def fix_groups(self, reverse, fuzzy): self.subpattern.fix_groups(reverse, fuzzy) def optimise(self, info): self.subpattern = self.subpattern.optimise(info) if self.subpattern.is_empty(): return self.subpattern return self def pack_characters(self, info): self.subpattern = self.subpattern.pack_characters(info) return self def remove_captures(self): self.subpattern = self.subpattern.remove_captures() return self def can_be_affix(self): return self.subpattern.can_be_affix() def contains_group(self): return self.subpattern.contains_group() def get_firstset(self, reverse): return self.subpattern.get_firstset(reverse) def has_simple_start(self): return self.subpattern.has_simple_start() def _compile(self, reverse, fuzzy): return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) + [(OP.END, )]) def _dump(self, indent, reverse): print "%sATOMIC" % (INDENT * indent) self.subpattern.dump(indent + 1, reverse) def is_empty(self): return self.subpattern.is_empty() def __eq__(self, other): return (type(self) is type(other) and self.subpattern == other.subpattern) def max_width(self): return self.subpattern.max_width() def get_required_string(self, reverse): return self.subpattern.get_required_string(reverse) class Boundary(ZeroWidthBase): _opcode = OP.BOUNDARY _op_name = "BOUNDARY" class Branch(RegexBase): def __init__(self, branches): RegexBase.__init__(self) self.branches = branches def fix_groups(self, reverse, fuzzy): for b in self.branches: b.fix_groups(reverse, fuzzy) def optimise(self, info): # Flatten branches within branches. branches = Branch._flatten_branches(info, self.branches) # Move any common prefix or suffix out of the branches. prefix, branches = Branch._split_common_prefix(info, branches) suffix, branches = Branch._split_common_suffix(info, branches) # Merge branches starting with the same character. (If a character # prefix doesn't match in one branch, it won't match in any of the # others starting with that same character.) branches = Branch._merge_common_prefixes(info, branches) # Try to reduce adjacent single-character branches to sets. branches = Branch._reduce_to_set(info, branches) if len(branches) > 1: sequence = prefix + [Branch(branches)] + suffix else: sequence = prefix + branches + suffix return make_sequence(sequence) def optimise(self, info): # Flatten branches within branches. branches = Branch._flatten_branches(info, self.branches) # Try to reduce adjacent single-character branches to sets. branches = Branch._reduce_to_set(info, branches) if len(branches) > 1: sequence = [Branch(branches)] else: sequence = branches return make_sequence(sequence) def pack_characters(self, info): self.branches = [b.pack_characters(info) for b in self.branches] return self def remove_captures(self): self.branches = [b.remove_captures() for b in self.branches] return self def is_atomic(self): return all(b.is_atomic() for b in self.branches) def can_be_affix(self): return all(b.can_be_affix() for b in self.branches) def contains_group(self): return any(b.contains_group() for b in self.branches) def get_firstset(self, reverse): fs = set() for b in self.branches: fs |= b.get_firstset(reverse) return fs or set([None]) def _compile(self, reverse, fuzzy): code = [(OP.BRANCH, )] for b in self.branches: code.extend(b.compile(reverse, fuzzy)) code.append((OP.NEXT, )) code[-1] = (OP.END, ) return code def _dump(self, indent, reverse): print "%sBRANCH" % (INDENT * indent) self.branches[0].dump(indent + 1, reverse) for b in self.branches[1 : ]: print "%sOR" % (INDENT * indent) b.dump(indent + 1, reverse) @staticmethod def _flatten_branches(info, branches): # Flatten the branches so that there aren't branches of branches. new_branches = [] for b in branches: b = b.optimise(info) if isinstance(b, Branch): new_branches.extend(b.branches) else: new_branches.append(b) return new_branches @staticmethod def _split_common_prefix(info, branches): # Common leading items can be moved out of the branches. # Get the items in the branches. alternatives = [] for b in branches: if isinstance(b, Sequence): alternatives.append(b.items) else: alternatives.append([b]) # What is the maximum possible length of the prefix? max_count = min(len(a) for a in alternatives) # What is the longest common prefix? prefix = alternatives[0] pos = 0 end_pos = max_count while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] == prefix[pos] for a in alternatives): pos += 1 count = pos if info.flags & UNICODE: # We need to check that we're not splitting a sequence of # characters which could form part of full case-folding. count = pos while count > 0 and not all(Branch._can_split(a, count) for a in alternatives): count -= 1 # No common prefix is possible. if count == 0: return [], branches # Rebuild the branches. new_branches = [] for a in alternatives: new_branches.append(make_sequence(a[count : ])) return prefix[ : count], new_branches @staticmethod def _split_common_suffix(info, branches): # Common trailing items can be moved out of the branches. # Get the items in the branches. alternatives = [] for b in branches: if isinstance(b, Sequence): alternatives.append(b.items) else: alternatives.append([b]) # What is the maximum possible length of the suffix? max_count = min(len(a) for a in alternatives) # What is the longest common suffix? suffix = alternatives[0] pos = -1 end_pos = -1 - max_count while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] == suffix[pos] for a in alternatives): pos -= 1 count = -1 - pos if info.flags & UNICODE: # We need to check that we're not splitting a sequence of # characters which could form part of full case-folding. while count > 0 and not all(Branch._can_split_rev(a, count) for a in alternatives): count -= 1 # No common suffix is possible. if count == 0: return [], branches # Rebuild the branches. new_branches = [] for a in alternatives: new_branches.append(make_sequence(a[ : -count])) return suffix[-count : ], new_branches @staticmethod def _can_split(items, count): # Check the characters either side of the proposed split. if not Branch._is_full_case(items, count - 1): return True if not Branch._is_full_case(items, count): return True # Check whether a 1-1 split would be OK. if Branch._is_folded(items[count - 1 : count + 1]): return False # Check whether a 1-2 split would be OK. if (Branch._is_full_case(items, count + 2) and Branch._is_folded(items[count - 1 : count + 2])): return False # Check whether a 2-1 split would be OK. if (Branch._is_full_case(items, count - 2) and Branch._is_folded(items[count - 2 : count + 1])): return False return True @staticmethod def _can_split_rev(items, count): end = len(items) # Check the characters either side of the proposed split. if not Branch._is_full_case(items, end - count): return True if not Branch._is_full_case(items, end - count - 1): return True # Check whether a 1-1 split would be OK. if Branch._is_folded(items[end - count - 1 : end - count + 1]): return False # Check whether a 1-2 split would be OK. if (Branch._is_full_case(items, end - count + 2) and Branch._is_folded(items[end - count - 1 : end - count + 2])): return False # Check whether a 2-1 split would be OK. if (Branch._is_full_case(items, end - count - 2) and Branch._is_folded(items[end - count - 2 : end - count + 1])): return False return True @staticmethod def _merge_common_prefixes(info, branches): # Branches with the same case-sensitive character prefix can be grouped # together if they are separated only by other branches with a # character prefix. prefixed = defaultdict(list) order = {} new_branches = [] for b in branches: if Branch._is_simple_character(b): # Branch starts with a simple character. prefixed[b.value].append([b]) order.setdefault(b.value, len(order)) elif (isinstance(b, Sequence) and b.items and Branch._is_simple_character(b.items[0])): # Branch starts with a simple character. prefixed[b.items[0].value].append(b.items) order.setdefault(b.items[0].value, len(order)) else: Branch._flush_char_prefix(info, prefixed, order, new_branches) new_branches.append(b) Branch._flush_char_prefix(info, prefixed, order, new_branches) return new_branches @staticmethod def _is_simple_character(c): return isinstance(c, Character) and c.positive and not c.case_flags @staticmethod def _reduce_to_set(info, branches): # Can the branches be reduced to a set? new_branches = [] items = set() case_flags = NOCASE for b in branches: if isinstance(b, (Character, Property, SetBase)): # Branch starts with a single character. if b.case_flags != case_flags: # Different case sensitivity, so flush. Branch._flush_set_members(info, items, case_flags, new_branches) case_flags = b.case_flags items.add(b.with_flags(case_flags=NOCASE)) else: Branch._flush_set_members(info, items, case_flags, new_branches) new_branches.append(b) Branch._flush_set_members(info, items, case_flags, new_branches) return new_branches @staticmethod def _flush_char_prefix(info, prefixed, order, new_branches): # Flush the prefixed branches. if not prefixed: return for value, branches in sorted(prefixed.items(), key=lambda pair: order[pair[0]]): if len(branches) == 1: new_branches.append(make_sequence(branches[0])) else: subbranches = [] optional = False for b in branches: if len(b) > 1: subbranches.append(make_sequence(b[1 : ])) elif not optional: subbranches.append(Sequence()) optional = True sequence = Sequence([Character(value), Branch(subbranches)]) new_branches.append(sequence.optimise(info)) prefixed.clear() order.clear() @staticmethod def _flush_set_members(info, items, case_flags, new_branches): # Flush the set members. if not items: return if len(items) == 1: item = list(items)[0] else: item = SetUnion(info, list(items)).optimise(info) new_branches.append(item.with_flags(case_flags=case_flags)) items.clear() @staticmethod def _is_full_case(items, i): if not 0 <= i < len(items): return False item = items[i] return (isinstance(item, Character) and item.positive and (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE) @staticmethod def _is_folded(items): if len(items) < 2: return False for i in items: if (not isinstance(i, Character) or not i.positive or not i.case_flags): return False folded = u"".join(unichr(i.value) for i in items) folded = _regex.fold_case(FULL_CASE_FOLDING, folded) # Get the characters which expand to multiple codepoints on folding. expanding_chars = _regex.get_expand_on_folding() for c in expanding_chars: if folded == _regex.fold_case(FULL_CASE_FOLDING, c): return True return False def is_empty(self): return all(b.is_empty() for b in self.branches) def __eq__(self, other): return type(self) is type(other) and self.branches == other.branches def max_width(self): return max(b.max_width() for b in self.branches) class CallGroup(RegexBase): def __init__(self, info, group, position): RegexBase.__init__(self) self.info = info self.group = group self.position = position self._key = self.__class__, self.group def fix_groups(self, reverse, fuzzy): try: self.group = int(self.group) except ValueError: try: self.group = self.info.group_index[self.group] except KeyError: raise error("unknown group at position %d" % self.position) if not 0 <= self.group <= self.info.group_count: raise error("unknown group at position %d" % self.position) if self.group > 0 and self.info.open_group_count[self.group] > 1: raise error("ambiguous group reference at position %d" % self.position) self.info.group_calls.append((self, reverse, fuzzy)) self._key = self.__class__, self.group def remove_captures(self): raise error("group reference not allowed at position %d" % self.position) def _compile(self, reverse, fuzzy): return [(OP.GROUP_CALL, self.call_ref)] def _dump(self, indent, reverse): print "%sGROUP_CALL %s" % (INDENT * indent, self.group) def __eq__(self, other): return type(self) is type(other) and self.group == other.group def max_width(self): return UNLIMITED class Character(RegexBase): _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False): OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE, False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE, True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV, (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV} def __init__(self, value, positive=True, case_flags=NOCASE, zerowidth=False): RegexBase.__init__(self) self.value = value self.positive = bool(positive) self.case_flags = case_flags self.zerowidth = bool(zerowidth) if (self.positive and (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE): self.folded = _regex.fold_case(FULL_CASE_FOLDING, unichr(self.value)) else: self.folded = unichr(self.value) self._key = (self.__class__, self.value, self.positive, self.case_flags, self.zerowidth) def rebuild(self, positive, case_flags, zerowidth): return Character(self.value, positive, case_flags, zerowidth) def optimise(self, info, in_set=False): return self def get_firstset(self, reverse): return set([self]) def has_simple_start(self): return True def _compile(self, reverse, fuzzy): flags = 0 if self.positive: flags |= POSITIVE_OP if self.zerowidth: flags |= ZEROWIDTH_OP if fuzzy: flags |= FUZZY_OP code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags, self.value]) if len(self.folded) > 1: # The character expands on full case-folding. code = Branch([code, String([ord(c) for c in self.folded], case_flags=self.case_flags)]) return code.compile(reverse, fuzzy) def _dump(self, indent, reverse): display = repr(unichr(self.value)).lstrip("bu") print "%sCHARACTER %s %s%s" % (INDENT * indent, POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]) def matches(self, ch): return (ch == self.value) == self.positive def max_width(self): return len(self.folded) def get_required_string(self, reverse): if not self.positive: return 1, None self.folded_characters = tuple(ord(c) for c in self.folded) return 0, self class Conditional(RegexBase): def __init__(self, info, group, yes_item, no_item, position): RegexBase.__init__(self) self.info = info self.group = group self.yes_item = yes_item self.no_item = no_item self.position = position def fix_groups(self, reverse, fuzzy): try: self.group = int(self.group) except ValueError: try: self.group = self.info.group_index[self.group] except KeyError: raise error("unknown group at position %d" % self.position) if not 1 <= self.group <= self.info.group_count: raise error("unknown group at position %d" % self.position) self.yes_item.fix_groups(reverse, fuzzy) self.no_item.fix_groups(reverse, fuzzy) def optimise(self, info): yes_item = self.yes_item.optimise(info) no_item = self.no_item.optimise(info) return Conditional(info, self.group, yes_item, no_item, self.position) def pack_characters(self, info): self.yes_item = self.yes_item.pack_characters(info) self.no_item = self.no_item.pack_characters(info) return self def remove_captures(self): self.yes_item = self.yes_item.remove_captures() self.no_item = self.no_item.remove_captures() def is_atomic(self): return self.yes_item.is_atomic() and self.no_item.is_atomic() def can_be_affix(self): return self.yes_item.can_be_affix() and self.no_item.can_be_affix() def contains_group(self): return self.yes_item.contains_group() or self.no_item.contains_group() def get_firstset(self, reverse): return (self.yes_item.get_firstset(reverse) | self.no_item.get_firstset(reverse)) def _compile(self, reverse, fuzzy): code = [(OP.GROUP_EXISTS, self.group)] code.extend(self.yes_item.compile(reverse, fuzzy)) add_code = self.no_item.compile(reverse, fuzzy) if add_code: code.append((OP.NEXT, )) code.extend(add_code) code.append((OP.END, )) return code def _dump(self, indent, reverse): print "%sGROUP_EXISTS %s" % (INDENT * indent, self.group) self.yes_item.dump(indent + 1, reverse) if self.no_item: print "%sOR" % (INDENT * indent) self.no_item.dump(indent + 1, reverse) def is_empty(self): return self.yes_item.is_empty() and self.no_item.is_empty() def __eq__(self, other): return type(self) is type(other) and (self.group, self.yes_item, self.no_item) == (other.group, other.yes_item, other.no_item) def max_width(self): return max(self.yes_item.max_width(), self.no_item.max_width()) class DefaultBoundary(ZeroWidthBase): _opcode = OP.DEFAULT_BOUNDARY _op_name = "DEFAULT_BOUNDARY" class DefaultEndOfWord(ZeroWidthBase): _opcode = OP.DEFAULT_END_OF_WORD _op_name = "DEFAULT_END_OF_WORD" class DefaultStartOfWord(ZeroWidthBase): _opcode = OP.DEFAULT_START_OF_WORD _op_name = "DEFAULT_START_OF_WORD" class EndOfLine(ZeroWidthBase): _opcode = OP.END_OF_LINE _op_name = "END_OF_LINE" class EndOfLineU(EndOfLine): _opcode = OP.END_OF_LINE_U _op_name = "END_OF_LINE_U" class EndOfString(ZeroWidthBase): _opcode = OP.END_OF_STRING _op_name = "END_OF_STRING" class EndOfStringLine(ZeroWidthBase): _opcode = OP.END_OF_STRING_LINE _op_name = "END_OF_STRING_LINE" class EndOfStringLineU(EndOfStringLine): _opcode = OP.END_OF_STRING_LINE_U _op_name = "END_OF_STRING_LINE_U" class EndOfWord(ZeroWidthBase): _opcode = OP.END_OF_WORD _op_name = "END_OF_WORD" class Fuzzy(RegexBase): def __init__(self, subpattern, constraints=None): RegexBase.__init__(self) if constraints is None: constraints = {} self.subpattern = subpattern self.constraints = constraints # If an error type is mentioned in the cost equation, then its maximum # defaults to unlimited. if "cost" in constraints: for e in "dis": if e in constraints["cost"]: constraints.setdefault(e, (0, None)) # If any error type is mentioned, then all the error maxima default to # 0, otherwise they default to unlimited. if set(constraints) & set("dis"): for e in "dis": constraints.setdefault(e, (0, 0)) else: for e in "dis": constraints.setdefault(e, (0, None)) # The maximum of the generic error type defaults to unlimited. constraints.setdefault("e", (0, None)) # The cost equation defaults to equal costs. Also, the cost of any # error type not mentioned in the cost equation defaults to 0. if "cost" in constraints: for e in "dis": constraints["cost"].setdefault(e, 0) else: constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max": constraints["e"][1]} def fix_groups(self, reverse, fuzzy): self.subpattern.fix_groups(reverse, True) def pack_characters(self, info): self.subpattern = self.subpattern.pack_characters(info) return self def remove_captures(self): self.subpattern = self.subpattern.remove_captures() return self def is_atomic(self): return self.subpattern.is_atomic() def contains_group(self): return self.subpattern.contains_group() def _compile(self, reverse, fuzzy): # The individual limits. arguments = [] for e in "dise": v = self.constraints[e] arguments.append(v[0]) arguments.append(UNLIMITED if v[1] is None else v[1]) # The coeffs of the cost equation. for e in "dis": arguments.append(self.constraints["cost"][e]) # The maximum of the cost equation. v = self.constraints["cost"]["max"] arguments.append(UNLIMITED if v is None else v) flags = 0 if reverse: flags |= REVERSE_OP return ([(OP.FUZZY, flags) + tuple(arguments)] + self.subpattern.compile(reverse, True) + [(OP.END,)]) def _dump(self, indent, reverse): constraints = self._constraints_to_string() if constraints: constraints = " " + constraints print "%sFUZZY%s" % (INDENT * indent, constraints) self.subpattern.dump(indent + 1, reverse) def is_empty(self): return self.subpattern.is_empty() def __eq__(self, other): return (type(self) is type(other) and self.subpattern == other.subpattern) def max_width(self): return UNLIMITED def _constraints_to_string(self): constraints = [] for name in "ids": min, max = self.constraints[name] if max == 0: continue con = "" if min > 0: con = "%s<=" % min con += name if max is not None: con += "<=%s" % max constraints.append(con) cost = [] for name in "ids": coeff = self.constraints["cost"][name] if coeff > 0: cost.append("%s%s" % (coeff, name)) limit = self.constraints["cost"]["max"] if limit is not None and limit > 0: cost = "%s<=%s" % ("+".join(cost), limit) constraints.append(cost) return ",".join(constraints) class Grapheme(RegexBase): def _compile(self, reverse, fuzzy): # Match at least 1 character until a grapheme boundary is reached. Note # that this is the same whether matching forwards or backwards. character_matcher = LazyRepeat(AnyAll(), 1, None).compile(reverse, fuzzy) boundary_matcher = [(OP.GRAPHEME_BOUNDARY, 1)] return character_matcher + boundary_matcher def _dump(self, indent, reverse): print "%sGRAPHEME" % (INDENT * indent) def max_width(self): return UNLIMITED class GreedyRepeat(RegexBase): _opcode = OP.GREEDY_REPEAT _op_name = "GREEDY_REPEAT" def __init__(self, subpattern, min_count, max_count): RegexBase.__init__(self) self.subpattern = subpattern self.min_count = min_count self.max_count = max_count def fix_groups(self, reverse, fuzzy): self.subpattern.fix_groups(reverse, fuzzy) def optimise(self, info): subpattern = self.subpattern.optimise(info) return type(self)(subpattern, self.min_count, self.max_count) def pack_characters(self, info): self.subpattern = self.subpattern.pack_characters(info) return self def remove_captures(self): self.subpattern = self.subpattern.remove_captures() return self def is_atomic(self): return self.min_count == self.max_count and self.subpattern.is_atomic() def contains_group(self): return self.subpattern.contains_group() def get_firstset(self, reverse): fs = self.subpattern.get_firstset(reverse) if self.min_count == 0: fs.add(None) return fs def _compile(self, reverse, fuzzy): repeat = [self._opcode, self.min_count] if self.max_count is None: repeat.append(UNLIMITED) else: repeat.append(self.max_count) subpattern = self.subpattern.compile(reverse, fuzzy) if not subpattern: return [] return ([tuple(repeat)] + subpattern + [(OP.END, )]) def _dump(self, indent, reverse): if self.max_count is None: limit = "INF" else: limit = self.max_count print "%s%s %s %s" % (INDENT * indent, self._op_name, self.min_count, limit) self.subpattern.dump(indent + 1, reverse) def is_empty(self): return self.subpattern.is_empty() def __eq__(self, other): return type(self) is type(other) and (self.subpattern, self.min_count, self.max_count) == (other.subpattern, other.min_count, other.max_count) def max_width(self): if self.max_count is None: return UNLIMITED return self.subpattern.max_width() * self.max_count def get_required_string(self, reverse): max_count = UNLIMITED if self.max_count is None else self.max_count if self.min_count == 0: w = self.subpattern.max_width() * max_count return min(w, UNLIMITED), None ofs, req = self.subpattern.get_required_string(reverse) if req: return ofs, req w = self.subpattern.max_width() * max_count return min(w, UNLIMITED), None class Group(RegexBase): def __init__(self, info, group, subpattern): RegexBase.__init__(self) self.info = info self.group = group self.subpattern = subpattern self.call_ref = None def fix_groups(self, reverse, fuzzy): self.info.defined_groups[self.group] = (self, reverse, fuzzy) self.subpattern.fix_groups(reverse, fuzzy) def optimise(self, info): subpattern = self.subpattern.optimise(info) return Group(self.info, self.group, subpattern) def pack_characters(self, info): self.subpattern = self.subpattern.pack_characters(info) return self def remove_captures(self): return self.subpattern.remove_captures() def is_atomic(self): return self.subpattern.is_atomic() def can_be_affix(self): return False def contains_group(self): return True def get_firstset(self, reverse): return self.subpattern.get_firstset(reverse) def has_simple_start(self): return self.subpattern.has_simple_start() def _compile(self, reverse, fuzzy): code = [] key = self.group, reverse, fuzzy ref = self.info.call_refs.get(key) if ref is not None: code += [(OP.CALL_REF, ref)] public_group = private_group = self.group if private_group < 0: public_group = self.info.private_groups[private_group] private_group = self.info.group_count - private_group code += ([(OP.GROUP, private_group, public_group)] + self.subpattern.compile(reverse, fuzzy) + [(OP.END, )]) if ref is not None: code += [(OP.END, )] return code def _dump(self, indent, reverse): group = self.group if group < 0: group = private_groups[group] print "%sGROUP %s" % (INDENT * indent, group) self.subpattern.dump(indent + 1, reverse) def __eq__(self, other): return (type(self) is type(other) and (self.group, self.subpattern) == (other.group, other.subpattern)) def max_width(self): return self.subpattern.max_width() def get_required_string(self, reverse): return self.subpattern.get_required_string(reverse) class LazyRepeat(GreedyRepeat): _opcode = OP.LAZY_REPEAT _op_name = "LAZY_REPEAT" class LookAround(RegexBase): _dir_text = {False: "AHEAD", True: "BEHIND"} def __new__(cls, behind, positive, subpattern): if positive and subpattern.is_empty(): return subpattern return RegexBase.__new__(cls) def __init__(self, behind, positive, subpattern): RegexBase.__init__(self) self.behind = bool(behind) self.positive = bool(positive) self.subpattern = subpattern def fix_groups(self, reverse, fuzzy): self.subpattern.fix_groups(self.behind, fuzzy) def optimise(self, info): subpattern = self.subpattern.optimise(info) return LookAround(self.behind, self.positive, subpattern) def pack_characters(self, info): self.subpattern = self.subpattern.pack_characters(info) return self def remove_captures(self): return self.subpattern.remove_captures() def is_atomic(self): return self.subpattern.is_atomic() def can_be_affix(self): return self.subpattern.can_be_affix() def contains_group(self): return self.subpattern.contains_group() def _compile(self, reverse, fuzzy): return ([(OP.LOOKAROUND, int(self.positive), int(not self.behind))] + self.subpattern.compile(self.behind) + [(OP.END, )]) def _dump(self, indent, reverse): print "%sLOOK%s %s" % (INDENT * indent, self._dir_text[self.behind], POS_TEXT[self.positive]) self.subpattern.dump(indent + 1, self.behind) def is_empty(self): return self.subpattern.is_empty() def __eq__(self, other): return type(self) is type(other) and (self.behind, self.positive, self.subpattern) == (other.behind, other.positive, other.subpattern) def max_width(self): return 0 class PrecompiledCode(RegexBase): def __init__(self, code): self.code = code def _compile(self, reverse, fuzzy): return [tuple(self.code)] class Property(RegexBase): _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False): OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False): OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True): OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE, True): OP.PROPERTY_IGN_REV} def __init__(self, value, positive=True, case_flags=NOCASE, zerowidth=False): RegexBase.__init__(self) self.value = value self.positive = bool(positive) self.case_flags = case_flags self.zerowidth = bool(zerowidth) self._key = (self.__class__, self.value, self.positive, self.case_flags, self.zerowidth) def rebuild(self, positive, case_flags, zerowidth): return Property(self.value, positive, case_flags, zerowidth) def optimise(self, info, in_set=False): return self def get_firstset(self, reverse): return set([self]) def has_simple_start(self): return True def _compile(self, reverse, fuzzy): flags = 0 if self.positive: flags |= POSITIVE_OP if self.zerowidth: flags |= ZEROWIDTH_OP if fuzzy: flags |= FUZZY_OP return [(self._opcode[self.case_flags, reverse], flags, self.value)] def _dump(self, indent, reverse): prop = PROPERTY_NAMES[self.value >> 16] name, value = prop[0], prop[1][self.value & 0xFFFF] print "%sPROPERTY %s %s:%s%s" % (INDENT * indent, POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags]) def matches(self, ch): return _regex.has_property_value(self.value, ch) == self.positive def max_width(self): return 1 class Range(RegexBase): _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN, (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN, (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV, (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV} _op_name = "RANGE" def __init__(self, lower, upper, positive=True, case_flags=NOCASE, zerowidth=False): RegexBase.__init__(self) self.lower = lower self.upper = upper self.positive = bool(positive) self.case_flags = case_flags self.zerowidth = bool(zerowidth) self._key = (self.__class__, self.lower, self.upper, self.positive, self.case_flags, self.zerowidth) def rebuild(self, positive, case_flags, zerowidth): return Range(self.lower, self.upper, positive, case_flags, zerowidth) def optimise(self, info, in_set=False): # Is the range case-sensitive? if not self.positive or not (self.case_flags & IGNORECASE) or in_set: return self # Is full case-folding possible? if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) != FULLIGNORECASE): return self # Get the characters which expand to multiple codepoints on folding. expanding_chars = _regex.get_expand_on_folding() # Get the folded characters in the range. items = [] for ch in expanding_chars: if self.lower <= ord(ch) <= self.upper: folded = _regex.fold_case(FULL_CASE_FOLDING, ch) items.append(String([ord(c) for c in folded], case_flags=self.case_flags)) if not items: # We can fall back to simple case-folding. return self if len(items) < self.upper - self.lower + 1: # Not all the characters are covered by the full case-folding. items.insert(0, self) return Branch(items) def _compile(self, reverse, fuzzy): flags = 0 if self.positive: flags |= POSITIVE_OP if self.zerowidth: flags |= ZEROWIDTH_OP if fuzzy: flags |= FUZZY_OP return [(self._opcode[self.case_flags, reverse], flags, self.lower, self.upper)] def _dump(self, indent, reverse): display_lower = repr(unichr(self.lower)).lstrip("bu") display_upper = repr(unichr(self.upper)).lstrip("bu") print "%sRANGE %s %s %s%s" % (INDENT * indent, POS_TEXT[self.positive], display_lower, display_upper, CASE_TEXT[self.case_flags]) def matches(self, ch): return (self.lower <= ch <= self.upper) == self.positive def max_width(self): return 1 class RefGroup(RegexBase): _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False): OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE, False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE, True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV, (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV} def __init__(self, info, group, position, case_flags=NOCASE): RegexBase.__init__(self) self.info = info self.group = group self.position = position self.case_flags = case_flags self._key = self.__class__, self.group, self.case_flags def fix_groups(self, reverse, fuzzy): try: self.group = int(self.group) except ValueError: try: self.group = self.info.group_index[self.group] except KeyError: raise error("unknown group at position %d" % self.position) if not 1 <= self.group <= self.info.group_count: raise error("unknown group at position %d" % self.position) self._key = self.__class__, self.group, self.case_flags def remove_captures(self): raise error("group reference not allowed at position %d" % self.position) def _compile(self, reverse, fuzzy): flags = 0 if fuzzy: flags |= FUZZY_OP return [(self._opcode[self.case_flags, reverse], flags, self.group)] def _dump(self, indent, reverse): print "%sREF_GROUP %s%s" % (INDENT * indent, self.group, CASE_TEXT[self.case_flags]) def max_width(self): return UNLIMITED class SearchAnchor(ZeroWidthBase): _opcode = OP.SEARCH_ANCHOR _op_name = "SEARCH_ANCHOR" class Sequence(RegexBase): def __init__(self, items=None): RegexBase.__init__(self) if items is None: items = [] self.items = items def fix_groups(self, reverse, fuzzy): for s in self.items: s.fix_groups(reverse, fuzzy) def optimise(self, info): # Flatten the sequences. items = [] for s in self.items: s = s.optimise(info) if isinstance(s, Sequence): items.extend(s.items) else: items.append(s) return make_sequence(items) def pack_characters(self, info): "Packs sequences of characters into strings." items = [] characters = [] case_flags = NOCASE for s in self.items: if type(s) is Character and s.positive: if s.case_flags != case_flags: # Different case sensitivity, so flush, unless neither the # previous nor the new character are cased. if s.case_flags or is_cased(info, s.value): Sequence._flush_characters(info, characters, case_flags, items) case_flags = s.case_flags characters.append(s.value) elif type(s) is String or type(s) is Literal: if s.case_flags != case_flags: # Different case sensitivity, so flush, unless the neither # the previous nor the new string are cased. if s.case_flags or any(is_cased(info, c) for c in characters): Sequence._flush_characters(info, characters, case_flags, items) case_flags = s.case_flags characters.extend(s.characters) else: Sequence._flush_characters(info, characters, case_flags, items) items.append(s.pack_characters(info)) Sequence._flush_characters(info, characters, case_flags, items) return make_sequence(items) def remove_captures(self): self.items = [s.remove_captures() for s in self.items] return self def is_atomic(self): return all(s.is_atomic() for s in self.items) def can_be_affix(self): return False def contains_group(self): return any(s.contains_group() for s in self.items) def get_firstset(self, reverse): fs = set() items = self.items if reverse: items.reverse() for s in items: fs |= s.get_firstset(reverse) if None not in fs: return fs fs.discard(None) return fs | set([None]) def has_simple_start(self): return self.items and self.items[0].has_simple_start() def _compile(self, reverse, fuzzy): seq = self.items if reverse: seq = seq[::-1] code = [] for s in seq: code.extend(s.compile(reverse, fuzzy)) return code def _dump(self, indent, reverse): for s in self.items: s.dump(indent, reverse) @staticmethod def _flush_characters(info, characters, case_flags, items): if not characters: return # Disregard case_flags if all of the characters are case-less. if case_flags & IGNORECASE: if not any(is_cased(info, c) for c in characters): case_flags = NOCASE if len(characters) == 1: items.append(Character(characters[0], case_flags=case_flags)) else: items.append(String(characters, case_flags=case_flags)) characters[:] = [] def is_empty(self): return all(i.is_empty() for i in self.items) def __eq__(self, other): return type(self) is type(other) and self.items == other.items def max_width(self): return sum(s.max_width() for s in self.items) def get_required_string(self, reverse): seq = self.items if reverse: seq = seq[::-1] offset = 0 for s in seq: ofs, req = s.get_required_string(reverse) offset += ofs if req: return offset, req return offset, None class SetBase(RegexBase): def __init__(self, info, items, positive=True, case_flags=NOCASE, zerowidth=False): RegexBase.__init__(self) self.info = info self.items = tuple(items) self.positive = bool(positive) self.case_flags = case_flags self.zerowidth = bool(zerowidth) self.char_width = 1 self._key = (self.__class__, self.items, self.positive, self.case_flags, self.zerowidth) def rebuild(self, positive, case_flags, zerowidth): return type(self)(self.info, self.items, positive, case_flags, zerowidth).optimise(self.info) def get_firstset(self, reverse): return set([self]) def has_simple_start(self): return True def _compile(self, reverse, fuzzy): flags = 0 if self.positive: flags |= POSITIVE_OP if self.zerowidth: flags |= ZEROWIDTH_OP if fuzzy: flags |= FUZZY_OP code = [(self._opcode[self.case_flags, reverse], flags)] for m in self.items: code.extend(m.compile()) code.append((OP.END, )) return code def _dump(self, indent, reverse): print "%s%s %s%s" % (INDENT * indent, self._op_name, POS_TEXT[self.positive], CASE_TEXT[self.case_flags]) for i in self.items: i.dump(indent + 1) def _handle_case_folding(self, info, in_set): # Is the set case-sensitive? if not self.positive or not (self.case_flags & IGNORECASE) or in_set: return self # Is full case-folding possible? if (not (self.info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) != FULLIGNORECASE): return self # Get the characters which expand to multiple codepoints on folding. expanding_chars = _regex.get_expand_on_folding() # Get the folded characters in the set. items = [] seen = set() for ch in expanding_chars: if self.matches(ord(ch)): folded = _regex.fold_case(FULL_CASE_FOLDING, ch) if folded not in seen: items.append(String([ord(c) for c in folded], case_flags=self.case_flags)) seen.add(folded) if not items: # We can fall back to simple case-folding. return self return Branch([self] + items) def max_width(self): # Is the set case-sensitive? if not self.positive or not (self.case_flags & IGNORECASE): return 1 # Is full case-folding possible? if (not (self.info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) != FULLIGNORECASE): return 1 # Get the characters which expand to multiple codepoints on folding. expanding_chars = _regex.get_expand_on_folding() # Get the folded characters in the set. seen = set() for ch in expanding_chars: if self.matches(ord(ch)): folded = _regex.fold_case(FULL_CASE_FOLDING, ch) seen.add(folded) if not seen: return 1 return max(len(folded) for folded in seen) class SetDiff(SetBase): _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False): OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False): OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True): OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE, True): OP.SET_DIFF_IGN_REV} _op_name = "SET_DIFF" def optimise(self, info, in_set=False): items = self.items if len(items) > 2: items = [items[0], SetUnion(info, items[1 : ])] if len(items) == 1: return items[0].with_flags(case_flags=self.case_flags, zerowidth=self.zerowidth).optimise(info, in_set) self.items = tuple(m.optimise(info, in_set=True) for m in items) return self._handle_case_folding(info, in_set) def matches(self, ch): m = self.items[0].matches(ch) and not self.items[1].matches(ch) return m == self.positive class SetInter(SetBase): _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False): OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE, False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE, True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV, (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV} _op_name = "SET_INTER" def optimise(self, info, in_set=False): items = [] for m in self.items: m = m.optimise(info, in_set=True) if isinstance(m, SetInter) and m.positive: # Intersection in intersection. items.extend(m.items) else: items.append(m) if len(items) == 1: return items[0].with_flags(case_flags=self.case_flags, zerowidth=self.zerowidth).optimise(info, in_set) self.items = tuple(items) return self._handle_case_folding(info, in_set) def matches(self, ch): m = all(i.matches(ch) for i in self.items) return m == self.positive class SetSymDiff(SetBase): _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False): OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE, False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV, (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True): OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV} _op_name = "SET_SYM_DIFF" def optimise(self, info, in_set=False): items = [] for m in self.items: m = m.optimise(info, in_set=True) if isinstance(m, SetSymDiff) and m.positive: # Symmetric difference in symmetric difference. items.extend(m.items) else: items.append(m) if len(items) == 1: return items[0].with_flags(case_flags=self.case_flags, zerowidth=self.zerowidth).optimise(info, in_set) self.items = tuple(items) return self._handle_case_folding(info, in_set) def matches(self, ch): m = False for i in self.items: m = m != i.matches(ch) return m == self.positive class SetUnion(SetBase): _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False): OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE, False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE, True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV, (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV} _op_name = "SET_UNION" def optimise(self, info, in_set=False): items = [] for m in self.items: m = m.optimise(info, in_set=True) if isinstance(m, SetUnion) and m.positive: # Union in union. items.extend(m.items) else: items.append(m) if len(items) == 1: i = items[0] return i.with_flags(positive=i.positive == self.positive, case_flags=self.case_flags, zerowidth=self.zerowidth).optimise(info, in_set) self.items = tuple(items) return self._handle_case_folding(info, in_set) def _compile(self, reverse, fuzzy): flags = 0 if self.positive: flags |= POSITIVE_OP if self.zerowidth: flags |= ZEROWIDTH_OP if fuzzy: flags |= FUZZY_OP characters, others = defaultdict(list), [] for m in self.items: if isinstance(m, Character): characters[m.positive].append(m.value) else: others.append(m) code = [(self._opcode[self.case_flags, reverse], flags)] for positive, values in characters.items(): flags = 0 if positive: flags |= POSITIVE_OP if len(values) == 1: code.append((OP.CHARACTER, flags, values[0])) else: code.append((OP.STRING, flags, len(values)) + tuple(values)) for m in others: code.extend(m.compile()) code.append((OP.END, )) return code def matches(self, ch): m = any(i.matches(ch) for i in self.items) return m == self.positive class StartOfLine(ZeroWidthBase): _opcode = OP.START_OF_LINE _op_name = "START_OF_LINE" class StartOfLineU(StartOfLine): _opcode = OP.START_OF_LINE_U _op_name = "START_OF_LINE_U" class StartOfString(ZeroWidthBase): _opcode = OP.START_OF_STRING _op_name = "START_OF_STRING" class StartOfWord(ZeroWidthBase): _opcode = OP.START_OF_WORD _op_name = "START_OF_WORD" class String(RegexBase): _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN, (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD, (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV, (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True): OP.STRING_FLD_REV} def __init__(self, characters, case_flags=NOCASE): self.characters = tuple(characters) self.case_flags = case_flags if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE: folded_characters = [] for char in self.characters: folded = _regex.fold_case(FULL_CASE_FOLDING, unichr(char)) folded_characters.extend(ord(c) for c in folded) else: folded_characters = self.characters self.folded_characters = tuple(folded_characters) self.required = False self._key = self.__class__, self.characters, self.case_flags def get_firstset(self, reverse): if reverse: pos = -1 else: pos = 0 return set([Character(self.characters[pos], case_flags=self.case_flags)]) def has_simple_start(self): return True def _compile(self, reverse, fuzzy): flags = 0 if fuzzy: flags |= FUZZY_OP if self.required: flags |= REQUIRED_OP return [(self._opcode[self.case_flags, reverse], flags, len(self.folded_characters)) + self.folded_characters] def _dump(self, indent, reverse): display = repr("".join(unichr(c) for c in self.characters)).lstrip("bu") print "%sSTRING %s%s" % (INDENT * indent, display, CASE_TEXT[self.case_flags]) def max_width(self): return len(self.folded_characters) def get_required_string(self, reverse): return 0, self class Literal(String): def _dump(self, indent, reverse): for c in self.characters: display = ascii("".join(chr(c))).lstrip("bu") print("{}CHARACTER MATCH {}{}".format(INDENT * indent, display, CASE_TEXT[self.case_flags])) class StringSet(RegexBase): _opcode = {(NOCASE, False): OP.STRING_SET, (IGNORECASE, False): OP.STRING_SET_IGN, (FULLCASE, False): OP.STRING_SET, (FULLIGNORECASE, False): OP.STRING_SET_FLD, (NOCASE, True): OP.STRING_SET_REV, (IGNORECASE, True): OP.STRING_SET_IGN_REV, (FULLCASE, True): OP.STRING_SET_REV, (FULLIGNORECASE, True): OP.STRING_SET_FLD_REV} def __init__(self, info, name, case_flags=NOCASE): self.info = info self.name = name self.case_flags = case_flags self._key = self.__class__, self.name, self.case_flags self.set_key = (name, self.case_flags) if self.set_key not in info.named_lists_used: info.named_lists_used[self.set_key] = len(info.named_lists_used) def _compile(self, reverse, fuzzy): index = self.info.named_lists_used[self.set_key] items = self.info.kwargs[self.name] case_flags = self.case_flags if not items: return [] encoding = self.info.flags & _ALL_ENCODINGS fold_flags = encoding | case_flags if fuzzy: choices = [self._folded(fold_flags, i) for i in items] # Sort from longest to shortest. choices.sort(key=lambda s: (-len(s), s)) branches = [] for string in choices: branches.append(Sequence([Character(c, case_flags=case_flags) for c in string])) if len(branches) > 1: branch = Branch(branches) else: branch = branches[0] branch = branch.optimise(self.info).pack_characters(self.info) return branch.compile(reverse, fuzzy) else: min_len = min(len(i) for i in items) max_len = max(len(self._folded(fold_flags, i)) for i in items) return [(self._opcode[case_flags, reverse], index, min_len, max_len)] def _dump(self, indent, reverse): print "%sSTRING_SET %s%s" % (INDENT * indent, self.name, CASE_TEXT[self.case_flags]) def _folded(self, fold_flags, item): if isinstance(item, unicode): return [ord(c) for c in _regex.fold_case(fold_flags, item)] else: return [ord(c) for c in item] def _flatten(self, s): # Flattens the branches. if isinstance(s, Branch): for b in s.branches: self._flatten(b) elif isinstance(s, Sequence) and s.items: seq = s.items while isinstance(seq[-1], Sequence): seq[-1 : ] = seq[-1].items n = 0 while n < len(seq) and isinstance(seq[n], Character): n += 1 if n > 1: seq[ : n] = [String([c.value for c in seq[ : n]], case_flags=self.case_flags)] self._flatten(seq[-1]) def max_width(self): if not self.info.kwargs[self.name]: return 0 if self.case_flags & IGNORECASE: fold_flags = (self.info.flags & _ALL_ENCODINGS) | self.case_flags return max(len(_regex.fold_case(fold_flags, i)) for i in self.info.kwargs[self.name]) else: return max(len(i) for i in self.info.kwargs[self.name]) class Source(object): "Scanner for the regular expression source string." def __init__(self, string): if isinstance(string, unicode): self.string = string self.char_type = unichr else: self.string = string self.char_type = chr self.pos = 0 self.ignore_space = False self.sep = string[ : 0] def get(self): string = self.string pos = self.pos try: if self.ignore_space: while True: if string[pos].isspace(): # Skip over the whitespace. pos += 1 elif string[pos] == "#": # Skip over the comment to the end of the line. pos = string.index("\n", pos) else: break ch = string[pos] self.pos = pos + 1 return ch except IndexError: # We've reached the end of the string. self.pos = pos return string[ : 0] except ValueError: # The comment extended to the end of the string. self.pos = len(string) return string[ : 0] def get_many(self, count=1): string = self.string pos = self.pos try: if self.ignore_space: substring = [] while len(substring) < count: while True: if string[pos].isspace(): # Skip over the whitespace. pos += 1 elif string[pos] == "#": # Skip over the comment to the end of the line. pos = string.index("\n", pos) else: break substring.append(string[pos]) pos += 1 substring = "".join(substring) else: substring = string[pos : pos + count] pos += len(substring) self.pos = pos return substring except IndexError: # We've reached the end of the string. self.pos = len(string) return "".join(substring) except ValueError: # The comment extended to the end of the string. self.pos = len(string) return "".join(substring) def get_while(self, test_set, include=True): string = self.string pos = self.pos if self.ignore_space: try: substring = [] while True: if string[pos].isspace(): # Skip over the whitespace. pos += 1 elif string[pos] == "#": # Skip over the comment to the end of the line. pos = string.index("\n", pos) elif (string[pos] in test_set) == include: substring.append(string[pos]) pos += 1 else: break self.pos = pos except IndexError: # We've reached the end of the string. self.pos = len(string) except ValueError: # The comment extended to the end of the string. self.pos = len(string) return "".join(substring) else: try: while (string[pos] in test_set) == include: pos += 1 substring = string[self.pos : pos] self.pos = pos return substring except IndexError: # We've reached the end of the string. substring = string[self.pos : pos] self.pos = pos return substring def skip_while(self, test_set, include=True): string = self.string pos = self.pos try: if self.ignore_space: while True: if string[pos].isspace(): # Skip over the whitespace. pos += 1 elif string[pos] == "#": # Skip over the comment to the end of the line. pos = string.index("\n", pos) elif (string[pos] in test_set) == include: pos += 1 else: break else: while (string[pos] in test_set) == include: pos += 1 self.pos = pos except IndexError: # We've reached the end of the string. self.pos = len(string) except ValueError: # The comment extended to the end of the string. self.pos = len(string) def match(self, substring): string = self.string pos = self.pos if self.ignore_space: try: for c in substring: while True: if string[pos].isspace(): # Skip over the whitespace. pos += 1 elif string[pos] == "#": # Skip over the comment to the end of the line. pos = string.index("\n", pos) else: break if string[pos] != c: return False pos += 1 self.pos = pos return True except IndexError: # We've reached the end of the string. return False except ValueError: # The comment extended to the end of the string. return False else: if not string.startswith(substring, pos): return False self.pos = pos + len(substring) return True def expect(self, substring): if not self.match(substring): raise error("missing %s at position %d" % (substring, self.pos)) def at_end(self): string = self.string pos = self.pos try: if self.ignore_space: while True: if string[pos].isspace(): pos += 1 elif string[pos] == "#": pos = string.index("\n", pos) else: break return pos >= len(string) except IndexError: # We've reached the end of the string. return True except ValueError: # The comment extended to the end of the string. return True class Info(object): "Info about the regular expression." def __init__(self, flags=0, char_type=None, kwargs={}): flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION] self.flags = flags self.global_flags = flags self.kwargs = kwargs self.group_count = 0 self.group_index = {} self.group_name = {} self.char_type = char_type self.named_lists_used = {} self.open_groups = [] self.open_group_count = {} self.defined_groups = {} self.group_calls = [] self.private_groups = {} def open_group(self, name=None): group = self.group_index.get(name) if group is None: while True: self.group_count += 1 if name is None or self.group_count not in self.group_name: break group = self.group_count if name: self.group_index[name] = group self.group_name[group] = name if group in self.open_groups: # We have a nested named group. We'll assign it a private group # number, initially negative until we can assign a proper # (positive) number. group_alias = -(len(self.private_groups) + 1) self.private_groups[group_alias] = group group = group_alias self.open_groups.append(group) self.open_group_count[group] = self.open_group_count.get(group, 0) + 1 return group def close_group(self): self.open_groups.pop() def is_open_group(self, name): # In version 1, a group reference can refer to an open group. We'll # just pretend the group isn't open. version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION if version == VERSION1: return False if name.isdigit(): group = int(name) else: group = self.group_index.get(name) return group in self.open_groups def _check_group_features(info, parsed): """Checks whether the reverse and fuzzy features of the group calls match the groups which they call. """ call_refs = {} additional_groups = [] for call, reverse, fuzzy in info.group_calls: # Look up the reference of this group call. key = (call.group, reverse, fuzzy) ref = call_refs.get(key) if ref is None: # This group doesn't have a reference yet, so look up its features. if call.group == 0: # Calling the pattern as a whole. rev = bool(info.flags & REVERSE) fuz = isinstance(parsed, Fuzzy) if (rev, fuz) != (reverse, fuzzy): # The pattern as a whole doesn't have the features we want, # so we'll need to make a copy of it with the desired # features. additional_groups.append((parsed, reverse, fuzzy)) else: # Calling a capture group. def_info = info.defined_groups[call.group] group = def_info[0] if def_info[1 : ] != (reverse, fuzzy): # The group doesn't have the features we want, so we'll # need to make a copy of it with the desired features. additional_groups.append((group, reverse, fuzzy)) ref = len(call_refs) call_refs[key] = ref call.call_ref = ref info.call_refs = call_refs info.additional_groups = additional_groups def _get_required_string(parsed, flags): "Gets the required string and related info of a parsed pattern." req_offset, required = parsed.get_required_string(bool(flags & REVERSE)) if required: required.required = True if req_offset >= UNLIMITED: req_offset = -1 req_flags = required.case_flags if not (flags & UNICODE): req_flags &= ~UNICODE req_chars = required.folded_characters else: req_offset = 0 req_chars = () req_flags = 0 return req_offset, req_chars, req_flags class Scanner: def __init__(self, lexicon, flags=0): self.lexicon = lexicon # Combine phrases into a compound pattern. patterns = [] for phrase, action in lexicon: # Parse the regular expression. source = Source(phrase) info = Info(flags, source.char_type) source.ignore_space = bool(info.flags & VERBOSE) parsed = _parse_pattern(source, info) if not source.at_end(): raise error("trailing characters at position %d" % source.pos) # We want to forbid capture groups within each phrase. patterns.append(parsed.remove_captures()) # Combine all the subpatterns into one pattern. info = Info(flags) patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)] parsed = Branch(patterns) # Optimise the compound pattern. parsed = parsed.optimise(info) parsed = parsed.pack_characters(info) # Get the required string. req_offset, req_chars, req_flags = _get_required_string(parsed, info.flags) # Check the features of the groups. _check_group_features(info, parsed) # Complain if there are any group calls. They are not supported by the # Scanner class. if info.call_refs: raise error("recursive regex not supported by Scanner") reverse = bool(info.flags & REVERSE) # Compile the compound pattern. The result is a list of tuples. code = parsed.compile(reverse) + [(OP.SUCCESS, )] # Flatten the code into a list of ints. code = _flatten_code(code) if not parsed.has_simple_start(): # Get the first set, if possible. try: fs_code = _compile_firstset(info, parsed.get_firstset(reverse)) fs_code = _flatten_code(fs_code) code = fs_code + code except _FirstSetError: pass # Check the global flags for conflicts. version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION if version not in (0, VERSION0, VERSION1): raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible") # Create the PatternObject. # # Local flags like IGNORECASE affect the code generation, but aren't # needed by the PatternObject itself. Conversely, global flags like # LOCALE _don't_ affect the code generation but _are_ needed by the # PatternObject. self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version, code, {}, {}, {}, [], req_offset, req_chars, req_flags, len(patterns)) def scan(self, string): result = [] append = result.append match = self.scanner.scanner(string).match i = 0 while True: m = match() if not m: break j = m.end() if i == j: break action = self.lexicon[m.lastindex - 1][1] if hasattr(action, '__call__'): self.match = m action = action(self, m.group()) if action is not None: append(action) i = j return result, string[i : ] # Get the known properties dict. PROPERTIES = _regex.get_properties() # Build the inverse of the properties dict. PROPERTY_NAMES = {} for prop_name, (prop_id, values) in PROPERTIES.items(): name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {})) name = max(name, prop_name, key=len) PROPERTY_NAMES[prop_id] = name, prop_values for val_name, val_id in values.items(): prop_values[val_id] = max(prop_values.get(val_id, ""), val_name, key=len) # Character escape sequences. CHARACTER_ESCAPES = { "a": "\a", "b": "\b", "f": "\f", "n": "\n", "r": "\r", "t": "\t", "v": "\v", } # Predefined character set escape sequences. CHARSET_ESCAPES = { "d": lookup_property(None, "Digit", True), "D": lookup_property(None, "Digit", False), "s": lookup_property(None, "Space", True), "S": lookup_property(None, "Space", False), "w": lookup_property(None, "Word", True), "W": lookup_property(None, "Word", False), } # Positional escape sequences. POSITION_ESCAPES = { "A": StartOfString(), "b": Boundary(), "B": Boundary(False), "m": StartOfWord(), "M": EndOfWord(), "Z": EndOfString(), } # Positional escape sequences when WORD flag set. WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES) WORD_POSITION_ESCAPES.update({ "b": DefaultBoundary(), "B": DefaultBoundary(False), "m": DefaultStartOfWord(), "M": DefaultEndOfWord(), })