Source code for pyparsing_regex._core

#!/usr/bin/python
# -*- coding: utf-8 -*-

import regex
import sys
import abc
from functools import partial

from schlichtanders.myobjects import Count, create_counter, Structure
import pyparsing_regex._helpers_regex as hre
from pprint import pformat

import cPickle
# NOTE: this rebinds the ``Count`` name that was imported from
# ``schlichtanders.myobjects`` above to whatever ``create_counter()`` returns.
# Within this module it is used both as a factory (``Count()``) and through
# ``Count.reset()``.
Count = create_counter() # does not work under cython

# Alternative kept by the original author for reference (named counter):
# Count = create_counter("Count") # this is unfortunately yet not pickable with pyximport

def deepcopy(o):
    """Return an independent copy of ``o`` via a pickle round trip.

    The object is serialised with the highest available pickle protocol
    (``-1``) and immediately deserialised again, which the original author
    uses as a faster substitute for a generic deep copy.

    :param o: any picklable object
    :return: a new object reconstructed from the pickled form of ``o``
    """
    serialized = cPickle.dumps(o, -1)
    clone = cPickle.loads(serialized)
    return clone

# Default "no limit" value for the ``maxMatches`` parameters below.
# ``sys.maxint`` only exists on Python 2; fall back to ``sys.maxsize`` so that
# evaluating this constant does not raise AttributeError on interpreters
# without ``maxint``. On Python 2 the value is unchanged.
_MAX_INT = getattr(sys, "maxint", sys.maxsize)


# parser specific definitions
# ===========================

class ParserElementType(object):
    """Abstract base capturing the pyparsing-like interface of a parser element.

    Concrete subclasses must implement ``setResultsName``, ``suppress``,
    ``repeat``, ``_parseString``, ``__iadd__`` and ``__ior__``.  The
    non-abstract methods here (``__call__``, ``parseString``, ``scanString``,
    ``searchString``, ``__add__``, ``__or__``) are built on top of those.
    """
    __metaclass__ = abc.ABCMeta

    def __call__(self, name, **kwargs):
        """Return a copy of this element with its results name set to ``name``.

        The element itself is left untouched; ``kwargs`` are forwarded to
        ``setResultsName``.
        """
        return deepcopy(self).setResultsName(name, **kwargs)

    @abc.abstractmethod
    def setResultsName(self, name, **kwargs):
        """Set the results name of this element.

        ``kwargs`` are accepted for compatibility with the pyparsing interface.

        :return: self
        """
        return self

    def setName(self, name):
        """Placeholder: naming is not implemented at this level.

        :param name: ignored here
        :return: self
        """
        return self

    @abc.abstractmethod
    def suppress(self):
        """Suppress the output of this element; useful to keep punctuation from
        cluttering up returned output.

        Intended effect on the regex: change outer brackets, e.g. ``(...)``,
        ``(?P<>...)`` or ``(?<>...)``, to the non-capturing ``(?:...)`` version.

        :return: self
        """
        return self

    @abc.abstractmethod
    def repeat(self, min=0, max=None):
        """Repeat this element between ``min`` and ``max`` times.

        Repetition is the main structural addition on top of the Structure type.

        :param min: minimal number of repetitions
        :param max: maximal number of repetitions, ``None`` meaning unbounded
        :raises NotImplementedError: always, unless overridden
        """
        # NotImplemented is a non-callable singleton; the exception type is
        # NotImplementedError.
        raise NotImplementedError()

    def parseString(self, instring, parseAll=False):
        """Execute the parse expression on the given string.

        This is the main entry point for client code once the complete
        expression has been built.  The actual work is delegated to
        ``_parseString``.

        Unlike pyparsing, no tab expansion or other preprocessing is applied
        to ``instring`` here.

        :param instring: the text to parse
        :param parseAll: if true, the grammar is required to consume the entire
            input; this is done by parsing with ``self + StringEnd()`` instead
            of ``self``
        :return: whatever ``_parseString`` returns (``scanString`` treats a
            ``None`` result as "no match")
        """
        parser = self + StringEnd() if parseAll else self
        return parser._parseString(instring)

    @abc.abstractmethod
    def _parseString(self, instring):
        """Parse ``instring``; to be implemented by subclasses.

        :raises NotImplementedError: always, unless overridden
        """
        raise NotImplementedError()

    def scanString(self, instring, maxMatches=_MAX_INT, overlap=False):
        """Scan the input string for expression matches.

        Generator: at each position ``i`` the expression is tried against
        ``instring[i:]``; every successful parse result is yielded as is.
        Because each attempt works on a slice, any location stored on the
        result (``parse_end``) is relative to the start of that slice, not to
        ``instring``.

        :param instring: the text to scan
        :param maxMatches: stop after this many matches have been yielded
        :param overlap: if true, scanning continues one character after the
            start of each match, so overlapping matches are reported;
            otherwise scanning continues after the end of the match
        """
        i = 0
        matches = 0
        while i < len(instring) and matches < maxMatches:
            m = self.parseString(instring[i:])
            if m is None:
                # no match starting here, try the next position
                i += 1
                continue
            yield m
            matches += 1
            if overlap:
                i += 1
            else:
                # Advance by at least one character: a zero-length match would
                # otherwise leave ``i`` unchanged and loop forever.
                i += max(m.parse_end, 1)

    def transformString(self, instring):
        """Not supported: always raises ``NotImplementedError``."""
        raise NotImplementedError("with regular expressions setParseAction is not supported and thus also not transformString")
        #maybe later by using regex substitution

    def searchString(self, instring, maxMatches=_MAX_INT):
        """Return all matches found by ``scanString`` as a list.

        :param instring: the text to search
        :param maxMatches: clip searching after this many matches are found
        :return: list of parse results
        """
        return list(self.scanString(instring, maxMatches=maxMatches))

    def __add__(self, other):
        """Return a new element: a copy of ``self`` with ``other`` appended via ``+=``."""
        base = deepcopy(self)
        base += other # (+=) == __iadd__
        return base

    @abc.abstractmethod
    def __iadd__(self, other):
        """In-place concatenation; to be implemented by subclasses."""
        raise NotImplementedError()

    def __or__(self, other):
        """Return a new element: a copy of ``self`` combined with ``other`` via ``|=``."""
        base = deepcopy(self)
        base |= other # (|=) == __ior__
        return base

    @abc.abstractmethod
    def __ior__(self, other):
        """In-place alternation; to be implemented by subclasses."""
        raise NotImplementedError()

    def pprint(self):
        """Return ``pformat`` applied to ``repr(self)`` (not implemented in more detail)."""
        return pformat(repr(self))


#: small helper classes for substructuring:
class Repeated(object):
    """Small container pairing a counter with the structure it repeats.

    ``count`` and ``structure`` are stored unchanged as attributes.
    """

    def __init__(self, count, structure):
        self.structure = structure
        self.count = count

    def __str__(self):
        count_text = str(self.count)
        structure_text = str(self.structure)
        return "Repeated{count: %s, structure: %s}" % (count_text, structure_text)


class ParserElement(ParserElementType):
    """Parser element backed by a single regex pattern string.

    Arbitrarily complex formulas are imitated directly by one regex pattern;
    after matching, the regex output is restructured to fulfil the
    ParserElement/Structure interface.

    Attributes set by ``__init__``:
      * ``pattern``   -- the regex source string
      * ``structure`` -- a ``Structure`` describing how the groups of the
        pattern are arranged in the output
      * ``name``      -- display name, initially the pattern itself
      * ``_compiled`` -- cached compiled regex, reset to ``None`` whenever the
        pattern changes

    Not implemented: whitespace support.
    """
    # value returned for leaves that did not take part in the match
    EMPTY = None

    # CONSTRUCTION
    # ============

    def __init__(self, pattern, silent=False):
        """Create an element from a regex ``pattern``.

        :param pattern: regex source string
        :param silent: if true, the element gets an empty ``Structure`` and
            the pattern is used as given; otherwise the structure holds one
            ``Count()`` and the pattern is wrapped with ``hre.group``
        """
        if silent:
            # create empty Structure:
            self.structure = Structure()
            self.pattern = pattern
            self.name = self.pattern
        else:
            # create Count() Structure
            self.structure = Structure(Count())
            self.pattern = hre.group(pattern)  # for every Count() there must be a group
            self.name = self.pattern
        self._compiled = None

    # LOGIC
    # =====

    def group(self, wrapper=None, pseudo=False, liftkeys=False, silent=None):
        """Group the structure in place and optionally group the pattern too.

        :param wrapper: forwarded to ``Structure.group``
        :param pseudo: forwarded to ``Structure.group``
        :param liftkeys: forwarded to ``Structure.group``
        :param silent: ``None`` keeps the pattern untouched (mainly needed for
            pseudo groups like those created for results names); a true value
            applies ``hre.ensure_grouping``; a false value applies ``hre.group``
        :return: None
        """
        # this is inplace:
        self.structure.group(wrapper, pseudo=pseudo, liftkeys=liftkeys)
        # normal grouping is done by Structure type,
        # but silent groups are nevertheless needed for correct regex semantics:
        if silent is None:
            pass  # keep old self.pattern
        elif silent:
            self.pattern = hre.ensure_grouping(self.pattern)
        else:
            self.pattern = hre.group(self.pattern)
        self._compiled = None
        return

    def setResultsName(self, name, **kwargs):
        """Set ``name`` on the structure; ``kwargs`` are for compatibility with
        the pyparsing interface and are ignored.

        :return: self
        """
        self.structure.set_name(name)
        return self

    def setName(self, name):
        """Set the display name used by ``__str__``/``__repr__``.

        :return: self
        """
        self.name = name
        return self

    def suppress(self):
        """Suppress the output of this element; useful to keep punctuation from
        cluttering up returned output.

        Every match of ``hre.begins_not_silently_grouped`` in the pattern is
        replaced by ``(?:`` (i.e. inner brackets such as ``(...)``,
        ``(?P<>...)`` or ``(?<>...)`` become non-capturing) and the structure
        is cleared.

        CAUTION: NOT REVERSIBLE!

        :return: self
        """
        self.pattern = hre.begins_not_silently_grouped.sub("(?:", self.pattern)
        self._compiled = None
        self.structure.clear()
        return self

    def repeat(self, min=0, max=None):
        """Repeat this element in place between ``min`` and ``max`` times.

        :param min: minimal number of repetitions
        :param max: maximal number of repetitions, ``None`` meaning unbounded
        :raises RuntimeError: if ``max`` is given and ``min > max``
        :return: None
        """
        if max is not None and min > max:
            raise RuntimeError("min <= max needed")

        # Inspect at most the first two elements of the structure.
        # A sentinel is used so that an empty structure (silent or suppressed
        # element) is detected instead of leaking StopIteration to the caller.
        missing = object()
        struct_iter = iter(self.structure)
        firstelem = next(struct_iter, missing)
        struct_empty = firstelem is missing
        struct_len_1 = (not struct_empty) and next(struct_iter, missing) is missing

        if struct_empty or (struct_len_1 and isinstance(firstelem, Repeated)):
            # Either there is no structure at all (nothing to restructure, so
            # no extra capturing level is needed), or the single sub group was
            # just repeated: prevent nested repeatings Repeat(Repeat).
            self.pattern = hre.ensure_grouping(self.pattern)
        else:
            # the grouping is done by wrapping into a Leaf,
            # so that we can construct a map function which does all
            # restructuring of the regex output
            self.group(
                wrapper=lambda structure: Repeated(Count(), structure),  # creates a complete Structure element
                pseudo=True,  # pass everything through
                liftkeys=True,  # pass everything through
                silent=False,  # this adds a grouping level also in the pattern
            )

        if max is None:
            self.pattern = r"%s{%s,}" % (self.pattern, min)
        elif min == max:
            self.pattern = r"%s{%s}" % (self.pattern, min)
        else:
            self.pattern = r"%s{%s,%s}" % (self.pattern, min, max)
        self._compiled = None

    def compile(self):
        """Compile ``self.pattern`` with ``regex.compile``, cache and return it."""
        # compile regex (should optimize itself)
        self._compiled = regex.compile(self.pattern)
        return self._compiled

    def _parseString(self, instring):
        """Match the pattern at the start of ``instring`` (no search).

        On success ``Count`` is reset, the structure is copied, and the copy
        is mapped twice: first to evaluate all counters, then to fill in the
        captured values.

        :return: the filled copy of the structure with ``parse_end`` set to
            the end offset of the match, or ``None`` if there is no match
        """
        if self._compiled is None:
            self.compile()
        match = self._compiled.match(instring)
        if match is None:
            return None

        Count.reset()
        struct = deepcopy(self.structure)
        mymatch, substructs, preprocess_func = self._parse_preprocess(match)
        struct.map(preprocess_func)
        struct.map(self._func_parse_leaf(mymatch, substructs))
        struct.parse_end = match.end()
        return struct

    @staticmethod
    def _parse_preprocess(match):
        """Prepare the first mapping pass over a structure.

        Returns ``(match_transformed, substructs, preprocess_func)``.
        Attention: ``match_transformed`` and ``substructs`` are still empty
        when returned; they are filled as a side effect of mapping
        ``preprocess_func`` over the structure.

        :param match: the regex match to be transformed
        :return: match_transformed, substructs, preprocess_func
        """
        match_transformed = []
        substructs = {}  # {evaluated count: substruct}

        def preprocess_func(leaf):
            """Evaluate a Count so that it refers to a fixed regex group."""
            if isinstance(leaf, Repeated):
                new_leaf = leaf.count.value  # evaluates and stores value directly
                # CAUTION: +1 as we now start counting at 0, but regex start counting at 1 for groups
                match_transformed.append(match.ends(new_leaf + 1))
                # recursive call
                leaf.structure.map(preprocess_func)
                # from here on everything is executed depth first (by recursion)
                substructs[new_leaf] = leaf.structure
            # elif isinstance(leaf, Count):
            else:  # there should be no other case
                new_leaf = leaf.value  # evaluates and stores value directly
                # CAUTION: +1 as we now start counting at 0, but regex start counting at 1 for groups
                match_transformed.append((match.ends(new_leaf + 1), match.captures(new_leaf + 1)))
            return new_leaf  # new_leaf is int

        return match_transformed, substructs, preprocess_func

    @staticmethod
    def _func_parse_leaf(mymatch, substructs):
        """Build the function for the second mapping pass.

        CAUTION: for this map to work correctly, every Count instance must
        already be evaluated and directly available (recursively!), i.e. first
        map the function returned by ``_parse_preprocess``.

        :param mymatch: list indexed by evaluated count; a list of end
            positions for repeated entries, an ``(ends, captures)`` pair for
            plain leaves
        :param substructs: mapping from evaluated count to the repeated
            sub-structure
        :return: ``recursive_parse(leaf, maxend=None)``
        """
        def recursive_parse(leaf, maxend=None):
            # Only the lookup decides between "repeated" and "plain leaf";
            # keeping the try body minimal prevents an unrelated KeyError
            # raised deeper in the recursion from being mistaken for that.
            try:
                substruct = substructs[leaf]
            except KeyError:
                substruct = None

            if substruct is None:
                # base case:
                # this is always a single entry
                # we have to check ends as optional fields might get skipped
                # and do not appear at all in ends/captures
                ends, captures = mymatch[leaf]
                if not ends:
                    # nothing matched at all on this entry
                    return ParserElement.EMPTY
                elif maxend is None or ends[0] <= maxend:
                    # some matches within subrange
                    # (``maxend is None`` is tested first so that an int is
                    # never compared against None)
                    del ends[0]
                    return captures.pop(0)
                else:
                    # matches, but not within this current subrange
                    return ParserElement.EMPTY

            # Repeated structure
            def gen():
                # repeated elements have ends while leafs have captures
                if maxend is None:
                    for end in mymatch[leaf]:
                        yield substruct.map(partial(recursive_parse, maxend=end), inplace=False)
                else:
                    for i, end in enumerate(mymatch[leaf]):
                        if end > maxend:
                            del mymatch[leaf][:i]  # delete everything parsed so far
                            break
                        yield substruct.map(partial(recursive_parse, maxend=end), inplace=False)
                    else:
                        # All remaining ends were consumed by this parent
                        # repetition; drop them so that a later parent
                        # repetition does not replay them.
                        del mymatch[leaf][:]

            # return simple list, which is flattened out automatically
            # (same effect as pseudo structure, however one could process this repetitions further,
            # e.g. keeping only last repetition like it is done in pyparsing for default)
            return list(gen())

        return recursive_parse

    def __iadd__(self, other):
        """Append ``other`` in place (structure, pattern and name are concatenated).

        A plain string is first turned into a ``ParserElement`` matching it
        literally (via ``regex.escape``).
        """
        if isinstance(other, basestring):
            other = ParserElement(regex.escape(other))
        self.structure += other.structure
        self.pattern += other.pattern
        self.name += other.name
        self._compiled = None
        return self

    def __radd__(self, other):
        """Support ``"literal" + element``; returns the element built from ``other``."""
        if isinstance(other, basestring):
            other = ParserElement(regex.escape(other))
        other += self  # __iadd__
        return other

    def __ior__(self, other):
        """Combine with ``other`` as a regex alternative, in place.

        A plain string is first turned into a ``ParserElement`` matching it
        literally (via ``regex.escape``).
        """
        # TODO I think there is some crucial error in this OR construction
        # related to fact, that in regex an additional or gets an additional Count,
        # however, such Counts getting empty because another branch was used, should
        # usually not appear in the output, but just get omitted
        # - more backtracking needed
        # TODO e.g. Delim.join_optional does not seem to work with pyparsing_regex
        # as the OR construction builds new Count(). There is something for this,
        # namely OR-construction with same group-number,
        # however one would have to indicate this here, which is not done in general
        # it seems to be implementation detail for pyparsing-regex unfortunately...
        if isinstance(other, basestring):
            other = ParserElement(regex.escape(other))
        self.structure += other.structure
        self.pattern += "|" + other.pattern
        self.name += "|" + other.name
        # group() also resets self._compiled
        self.group(pseudo=True, liftkeys=True, silent=True)
        return self

    def __ror__(self, other):
        """Support ``"literal" | element``; returns the element built from ``other``."""
        if isinstance(other, basestring):
            other = ParserElement(regex.escape(other))
        other |= self  # __ior__
        return other

    def __str__(self):
        # [] are for pprint, () would make more sense
        return "['%s', %s]" % (self.name, str(self.structure))

    def __repr__(self):
        # [] are for pprint, () would make more sense
        return "['%s', %s, r'%s']" % (self.name, repr(self.structure), self.pattern)
# copy for non-cyclic imports:
class StringEnd(ParserElement):
    """Parser element that matches the end of the text (regex ``$``)."""

    def __init__(self):
        ParserElement.__init__(self, r"$")