#!/usr/bin/python
# -*- coding: utf-8 -*-
import regex
import sys
import abc
from functools import partial
from schlichtanders.myobjects import Count, create_counter, Structure
import pyparsing_regex._helpers_regex as hre
from pprint import pformat
import cPickle
# Rebinds ``Count`` (also imported above) to the object returned by
# ``create_counter()``; this module instantiates it as ``Count()`` and resets
# it via ``Count.reset()``.
Count = create_counter() # does not work under cython
# Count = create_counter("Count") # unfortunately not yet picklable with pyximport
def deepcopy(o):
    """Deep-copy ``o`` by a pickle round trip (a fast ``copy.deepcopy`` stand-in).

    :param o: any picklable object
    :return: an independent copy rebuilt from the pickled bytes of ``o``
    """
    # a negative protocol number selects the highest protocol available
    serialized = cPickle.dumps(o, -1)
    return cPickle.loads(serialized)
# Default for the ``maxMatches`` parameters below, i.e. effectively "no limit".
_MAX_INT = sys.maxint
# parser specific definitions
# ===========================
class ParserElementType(object):
    """Abstract class capturing the interface of an arbitrary ParserElement of pyparsing.

    Subclasses provide naming, suppression, repetition, the raw
    ``_parseString`` and the in-place operators ``+=`` / ``|=``. The copying
    operators ``+`` / ``|`` and the scan/search helpers are derived from
    those here.
    """
    __metaclass__ = abc.ABCMeta

    def __call__(self, name, **kwargs):
        """Call ``setResultsName(name, **kwargs)`` on a deep copy of ``self``.

        :param name: results name forwarded to ``setResultsName``
        :param kwargs: forwarded unchanged (pyparsing compatibility)
        :return: whatever ``setResultsName`` of the copy returns
        """
        return deepcopy(self).setResultsName(name, **kwargs)

    @abc.abstractmethod
    def setResultsName(self, name, **kwargs):
        """Set the results name; kwargs are for compatibility with the pyparsing interface.

        :return: ``self`` in this default implementation
        """
        return self

    def setName(self, name):
        """Not yet implemented here, so no easier output yet.

        :param name: ignored by this default implementation
        :return: ``self``
        """
        return self

    @abc.abstractmethod
    def suppress(self):
        """Suppress the output of this ParserElement.

        Useful to keep punctuation from cluttering up returned output.
        Implementations change outer brackets, e.g. ``(...)`` or ``(?P<>...)``
        or ``(?<>...)``, to the non-capturing ``(?:...)`` version.

        :return: ``self`` in this default implementation
        """
        return self

    @abc.abstractmethod
    def repeat(self, min=0, max=None):
        """Repetition is the main structural addition on top of the Structure type.

        :param min: minimal number of repetitions
        :param max: maximal number of repetitions
        :raises NotImplementedError: always, in this abstract default
        """
        # ``NotImplemented`` is a non-callable constant for binary operators;
        # the exception to raise from an abstract method is NotImplementedError
        raise NotImplementedError()

    def parseString(self, instring, parseAll=False):
        """Execute the parse expression on ``instring``.

        This is the main interface for client code once the complete
        expression has been built.

        :param instring: the text to parse
        :param parseAll: if true, ``StringEnd()`` is appended (on a copy, via
            ``+``) before parsing, so that the grammar has to cover the
            entire input
        :return: whatever ``_parseString`` returns for ``instring``
        """
        parser = self + StringEnd() if parseAll else self
        return parser._parseString(instring)

    @abc.abstractmethod
    def _parseString(self, instring):
        """Parse ``instring``; to be provided by subclasses.

        :raises NotImplementedError: always, in this abstract default
        """
        raise NotImplementedError()

    def scanString(self, instring, maxMatches=_MAX_INT, overlap=False):
        """Scan ``instring`` for expression matches and generate their parse results.

        At every offset ``parseString`` is tried on the remaining suffix of
        ``instring``; each result that is not ``None`` is yielded.

        Note that, unlike pyparsing, only the parse result is yielded, and its
        ``parse_end`` is relative to the suffix that was parsed, not to
        ``instring`` as a whole.

        :param instring: the text to scan
        :param maxMatches: stop after this many results have been yielded
        :param overlap: if true, the scan continues one character after the
            start of a match, otherwise after the end of the match
        """
        pos = 0
        found = 0
        total = len(instring)
        while pos < total and found < maxMatches:
            result = self.parseString(instring[pos:])
            if result is None:
                pos += 1
                continue
            yield result
            found += 1
            if overlap:
                pos += 1
            else:
                # always advance by at least one character: a zero-length
                # match would otherwise be yielded forever at the same offset
                pos += max(result.parse_end, 1)

    def transformString(self, instring):
        """Not supported.

        :raises NotImplementedError: always
        """
        raise NotImplementedError("with regular expressions setParseAction is not supported and thus also not transformString")
        # maybe later by using regex substitution

    def searchString(self, instring, maxMatches=_MAX_INT):
        """Collect the results of ``scanString`` into a list.

        :param instring: the text to search
        :param maxMatches: clip the search after this many matches are found
        :return: list of the parse results generated by ``scanString``
        """
        return list(self.scanString(instring, maxMatches=maxMatches))

    def __add__(self, other):
        """Return a deep copy of ``self`` with ``other`` added via ``+=``."""
        base = deepcopy(self)
        base += other  # (+=) == __iadd__
        return base

    @abc.abstractmethod
    def __iadd__(self, other):
        """In-place ``+=``; to be provided by subclasses.

        :raises NotImplementedError: always, in this abstract default
        """
        raise NotImplementedError()

    def __or__(self, other):
        """Return a deep copy of ``self`` combined with ``other`` via ``|=``."""
        base = deepcopy(self)
        base |= other  # (|=) == __ior__
        return base

    @abc.abstractmethod
    def __ior__(self, other):
        """In-place ``|=``; to be provided by subclasses.

        :raises NotImplementedError: always, in this abstract default
        """
        raise NotImplementedError()

    def pprint(self):
        """Return ``pformat`` of ``repr(self)``; not implemented in more detail."""
        return pformat(repr(self))
#: small helper classes for substructuring:
class Repeated(object):
    """Small helper pairing the count of a repetition with the structure being repeated."""

    def __init__(self, count, structure):
        self.count, self.structure = count, structure

    def __str__(self):
        shown = (str(self.count), str(self.structure))
        return "Repeated{count: %s, structure: %s}" % shown
class ParserElement(ParserElementType):
    """Parser element backed by one single regex string.

    We can imitate arbitrarily complex formulas directly by a single
    regex string; the output gets restructured (in linear time) to fulfil the
    ParserElement/Structure interface.

    Not implemented: whitespace support.
    """
    # value returned for a leaf which captured nothing (within the current range)
    EMPTY = None

    # CONSTRUCTION
    # ============

    def __init__(self, pattern, silent=False):
        """
        :param pattern: regex pattern string of this element
        :param silent: if true, the pattern is taken as is together with an
            empty Structure; otherwise the pattern is wrapped by ``hre.group``
            and the Structure holds one ``Count()``
        """
        if silent:
            # create empty Structure:
            self.structure = Structure()
            self.pattern = pattern
        else:
            # create Count() Structure
            self.structure = Structure(Count())
            self.pattern = hre.group(pattern)  # for every Count() there must be a group
        self.name = self.pattern
        self._compiled = None  # compiled lazily, see ``compile``

    # LOGIC
    # =====

    def group(self, wrapper=None, pseudo=False, liftkeys=False, silent=None):
        """Group structure and pattern in place.

        :param wrapper: forwarded to ``Structure.group``
        :param pseudo: forwarded to ``Structure.group``
        :param liftkeys: forwarded to ``Structure.group``
        :param silent: ``None`` keeps the pattern untouched (mainly needed for
            pseudo groups like the ones created for ResultNames), a true value
            applies ``hre.ensure_grouping`` to the pattern, a false value
            applies ``hre.group``
        :return: ``None``
        """
        # this is inplace:
        self.structure.group(wrapper, pseudo=pseudo, liftkeys=liftkeys)
        # normal grouping is done by Structure type,
        # but silent groups are nevertheless needed for correct regex semantics:
        if silent is None:
            pass  # keep old self.pattern
        elif silent:
            self.pattern = hre.ensure_grouping(self.pattern)
        else:
            self.pattern = hre.group(self.pattern)
        self._compiled = None
        return

    def setResultsName(self, name, **kwargs):
        """Set ``name`` on the structure; kwargs are for compatibility with the pyparsing interface.

        :return: ``self``
        """
        self.structure.set_name(name)
        return self

    def setName(self, name):
        """Set the display name used by ``__str__`` / ``__repr__``.

        :return: ``self``
        """
        self.name = name
        return self

    def suppress(self):
        """Suppress the output of this ParserElement.

        Useful to keep punctuation from cluttering up returned output.
        Changes all inner brackets, e.g. ``(...)`` or ``(?P<>...)`` or
        ``(?<>...)``, to the non-capturing ``(?:...)`` version and clears the
        structure.

        CAUTION: NOT REVERSIBLE!

        :return: ``self``
        """
        self.pattern = hre.begins_not_silently_grouped.sub("(?:", self.pattern)
        self._compiled = None
        self.structure.clear()
        return self

    def repeat(self, min=0, max=None):
        """Repeat this (arbitrary) ParserElement in place.

        :param min: minimal number of repetitions
        :param max: maximal number of repetitions, ``None`` for unbounded
        :raises RuntimeError: if ``min > max``
        :return: ``None``
        """
        if max is not None and min > max:
            raise RuntimeError("min <= max needed")

        # If the structure is empty (e.g. after .suppress()) there is nothing
        # to restructure, and if the only element was just repeated there is
        # no need for a further nesting. In both cases only the pattern has
        # to be grouped so that the quantifier applies to all of it.
        nothing = object()
        struct_iter = iter(self.structure)
        # default value: an empty structure must not leak StopIteration
        firstelem = next(struct_iter, nothing)
        if firstelem is nothing:
            needs_wrapping = False
        else:
            struct_len_1 = next(struct_iter, nothing) is nothing
            # prevent nested repeatings Repeat(Repeat)
            needs_wrapping = not (struct_len_1 and isinstance(firstelem, Repeated))

        if needs_wrapping:
            # the grouping is done by wrapping into a Leaf,
            # so that we can construct a map function which does all restructuring of the regex output
            self.group(
                wrapper=lambda structure: Repeated(Count(), structure),  # creates a complete Structure element
                pseudo=True,  # pass everything through
                liftkeys=True,  # pass everything through
                silent=False,  # this adds a grouping level also in the pattern
            )
        else:
            self.pattern = hre.ensure_grouping(self.pattern)

        if max is None:
            self.pattern = r"%s{%s,}" % (self.pattern, min)
        elif min == max:
            self.pattern = r"%s{%s}" % (self.pattern, min)
        else:
            self.pattern = r"%s{%s,%s}" % (self.pattern, min, max)
        self._compiled = None

    def compile(self):
        """Compile ``self.pattern`` with ``regex``, cache and return the result."""
        # compile regex (should optimize itself)
        self._compiled = regex.compile(self.pattern)
        return self._compiled

    def _parseString(self, instring):
        """Match at the start of ``instring`` - no search.

        The structure is deep-copied beforehand, so ``self.structure`` itself
        is not filled.

        :return: ``None`` if the pattern does not match, otherwise the filled
            copy of the structure with ``parse_end`` set to the end of the match
        """
        if self._compiled is None:
            self.compile()
        match = self._compiled.match(instring)
        if match is None:
            return None

        Count.reset()
        struct = deepcopy(self.structure)
        mymatch, substructs, preprocess_func = self._parse_preprocess(match)
        # first pass fills mymatch/substructs, second pass consumes them
        struct.map(preprocess_func)
        struct.map(self._func_parse_leaf(mymatch, substructs))
        struct.parse_end = match.end()
        return struct

    @staticmethod
    def _parse_preprocess(match):
        """Evaluate Counts, transform the match, and dump structures of repetitions.

        Attention: ``match_transformed`` and ``substructs`` are still empty
        when returned; they get filled by mapping ``preprocess_func``.

        :param match: regex match to be transformed
        :return: match_transformed, substructs, preprocess_func
        """
        match_transformed = []
        substructs = {}  # {Count value: substruct}

        def preprocess_func(leaf):
            """Evaluate all Count instances so that they refer to a fixed group."""
            if isinstance(leaf, Repeated):
                new_leaf = leaf.count.value  # evaluates and stores value directly
                # CAUTION: +1 as we now start counting at 0, but regex starts counting at 1 for groups
                match_transformed.append(match.ends(new_leaf + 1))
                # recursive call
                leaf.structure.map(preprocess_func)
                # from here on everything is executed depth first (by recursion)
                substructs[new_leaf] = leaf.structure
            else:  # there should be no other case than a Count
                new_leaf = leaf.value  # evaluates and stores value directly
                # CAUTION: +1 as we now start counting at 0, but regex starts counting at 1 for groups
                match_transformed.append((match.ends(new_leaf + 1), match.captures(new_leaf + 1)))
            return new_leaf  # new_leaf is int

        return match_transformed, substructs, preprocess_func

    @staticmethod
    def _func_parse_leaf(mymatch, substructs):
        """Build the map function which fills in the captures.

        CAUTION: for this map to work correctly, every Count instance must
        already be evaluated (recursively!), i.e. first map the
        ``preprocess_func`` returned by ``_parse_preprocess``.

        :param mymatch: ``match_transformed`` as filled by ``preprocess_func``
        :param substructs: ``substructs`` as filled by ``preprocess_func``
        :return: function ``recursive_parse(leaf, maxend=None)``
        """
        def recursive_parse(leaf, maxend=None):
            if leaf in substructs:  # Repeated structure
                substruct = substructs[leaf]
                ends = mymatch[leaf]  # repeated elements have ends while leafs have captures
                if maxend is None:
                    in_range = list(ends)
                else:
                    consumed = 0
                    for end in ends:
                        if end > maxend:
                            break
                        consumed += 1
                    in_range = ends[:consumed]
                    # delete everything parsed so far - also when all
                    # remaining ends were in range, as otherwise a later
                    # outer repetition would yield them a second time
                    del ends[:consumed]
                # return simple list, which is flattened out automatically
                # (same effect as pseudo structure, however one could process these repetitions further,
                # e.g. keeping only the last repetition like it is done in pyparsing by default)
                return [
                    substruct.map(partial(recursive_parse, maxend=end), inplace=False)
                    for end in in_range
                ]

            # base case: this is always a single entry
            # we have to check ends as optional fields might get skipped and do not appear at all in ends/captures
            ends, captures = mymatch[leaf]
            if not ends:  # nothing matched at all on this entry
                return ParserElement.EMPTY
            # test for None first, so that maxend is never compared as None
            if maxend is None or ends[0] <= maxend:  # some matches within subrange
                del ends[0]
                return captures.pop(0)
            # matches, but not within this current subrange
            return ParserElement.EMPTY

        return recursive_parse

    def __iadd__(self, other):
        """Append ``other`` in place; plain strings are matched literally."""
        if isinstance(other, basestring):
            other = ParserElement(regex.escape(other))
        self.structure += other.structure
        self.pattern += other.pattern
        self.name += other.name
        self._compiled = None
        return self

    def __radd__(self, other):
        """Support ``"literal" + element``; plain strings are matched literally."""
        if isinstance(other, basestring):
            other = ParserElement(regex.escape(other))
        other += self  # __iadd__
        return other

    def __ior__(self, other):
        """Add ``other`` as alternative in place; plain strings are matched literally."""
        # TODO I think there is some crucial error in this OR construction
        #   related to the fact that in regex an additional or gets an additional Count,
        #   however, such Counts getting empty because another branch was used should
        #   usually not appear in the output, but just get omitted
        #   - more backtracking needed
        # TODO e.g. Delim.join_optional does not seem to work with pyparsing_regex
        #   as the OR construction builds new Count(). There is something for this,
        #   namely OR-construction with same group-number,
        #   however one would have to indicate this here, which is not done in general
        #   it seems to be an implementation detail for pyparsing-regex unfortunately...
        if isinstance(other, basestring):
            other = ParserElement(regex.escape(other))
        self.structure += other.structure
        self.pattern += "|" + other.pattern
        self.name += "|" + other.name
        # group() also invalidates the compiled pattern
        self.group(pseudo=True,
                   liftkeys=True,
                   silent=True)
        return self

    def __ror__(self, other):
        """Support ``"literal" | element``; plain strings are matched literally."""
        if isinstance(other, basestring):
            other = ParserElement(regex.escape(other))
        other |= self  # __ior__
        return other

    def __str__(self):
        # [] are for pprint, () would make more sense
        return "['%s', %s]" % (self.name, str(self.structure))

    def __repr__(self):
        # [] are for pprint, () would make more sense
        return "['%s', %s, r'%s']" % (self.name, repr(self.structure), self.pattern)
# copy for non-cyclic imports:
class StringEnd(ParserElement):
    def __init__(self):
        """Element matching the end of the text (regex ``$``)."""
        end_of_text = r"$"
        super(StringEnd, self).__init__(end_of_text)