326 lines
13 KiB
Python
326 lines
13 KiB
Python
"""Nodes that make up parse trees
|
|
|
|
Parsing spits out a tree of these, which you can then tell to walk itself and
|
|
spit out a useful value. Or you can walk it yourself; the structural attributes
|
|
are public.
|
|
|
|
"""
|
|
# TODO: If this is slow, think about using cElementTree or something.
|
|
from inspect import isfunction
|
|
from sys import version_info, exc_info
|
|
|
|
from parsimonious.exceptions import VisitationError, UndefinedLabel
|
|
|
|
|
|
class Node(object):
    """One node of a parse tree

    Treat nodes as read-only after construction. The parser's cache saves
    memory by handing out the same ``Node`` object more than once, so a
    single instance may sit at several places in one tree; mutate it and the
    change shows up at every one of those places.

    Nodes deliberately know nothing about any output format. Whatever you
    eventually produce from the tree (a rendered wiki page, a compiler's
    intermediate representation, ...) lives outside of it, so that one parse
    can feed several different renderings, one after another.

    """
    # Subclassing list was tried and abandoned: it forced us to build invalid
    # instances and repair them afterwards, among other problems.
    __slots__ = [
        'expr',       # The expression that generated me
        'full_text',  # The full text fed to the parser
        'start',      # The position in the text where that expr started
                      # matching
        'end',        # The position after start where the expr first didn't
                      # match. [start:end] follow Python slice conventions.
        'children',   # List of child parse tree nodes
    ]

    def __init__(self, expr, full_text, start, end, children=None):
        """Store the structural attributes; a falsy ``children`` becomes a
        fresh empty list."""
        self.expr, self.full_text = expr, full_text
        self.start, self.end = start, end
        self.children = children if children else []

    @property
    def expr_name(self):
        """Name of the expression that produced this node.

        Kept for backwards compatibility; equivalent to ``node.expr.name``.

        """
        return self.expr.name

    def __iter__(self):
        """Iterate over my children.

        This is what makes both ``for child in node`` and tuple-unpacking a
        node (handy in argument lists; see :class:`PegVisitor` for an
        example) work.

        """
        return iter(self.children)

    @property
    def text(self):
        """The slice of the full text that this node matched."""
        begin, stop = self.start, self.end
        return self.full_text[begin:stop]

    # Everything below exists only to support testing and debugging.

    def prettily(self, error=None):
        """Return a multi-line, indented rendering of me and my descendants.

        :arg error: A node to flag in the output because an error occurred
            there

        """
        # TODO: The flag is matched by identity, so a Node that appears at
        # several places in the tree gets flagged at all of them. Whoops.
        label = self.expr_name
        called = (' called "%s"' % label) if label else ''
        flag = ' <-- *** We were here. ***' if error is self else ''
        rows = [u'<%s%s matching "%s">%s' % (
            self.__class__.__name__, called, self.text, flag)]
        # Each child's rendering is shifted right, line by line, so nesting
        # depth is visible.
        rows.extend(
            '\n'.join(' ' + row
                      for row in child.prettily(error=error).splitlines())
            for child in self)
        return '\n'.join(rows)

    def __str__(self):
        """Return the same human-readable rendering as :meth:`prettily`."""
        return self.prettily()

    def __eq__(self, other):
        """Compare by value, recursing into children; intended for tests.

        Returns ``NotImplemented`` when ``other`` is not a ``Node``.

        """
        if isinstance(other, Node):
            return (self.expr == other.expr and
                    self.full_text == other.full_text and
                    self.start == other.start and
                    self.end == other.end and
                    self.children == other.children)
        return NotImplemented

    def __ne__(self, other):
        """Return the negation of ``self == other``."""
        return not (self == other)

    def __repr__(self, top_level=True):
        """Return a snippet of code (statements, not a single expression)
        that would rebuild me.

        :arg top_level: When true, first emit the ``s = ...`` assignment of
            the full text that the constructor calls refer to. Children are
            rendered with this turned off.

        """
        # repr() escapes as needed, so there is nothing left to encode
        # explicitly afterward.
        pieces = []
        if top_level:
            pieces.append("s = %r" % self.full_text)
        if self.children:
            kids = ', children=[%s]' % ', '.join(
                child.__repr__(top_level=False) for child in self.children)
        else:
            kids = ''
        pieces.append("%s(%r, s, %s, %s%s)" % (
            self.__class__.__name__, self.expr, self.start, self.end, kids))
        return '\n'.join(pieces)
|
|
|
|
|
|
class RegexNode(Node):
    """Node returned from a ``Regex`` expression

    Adds one attribute, ``match``, on top of what :class:`Node` carries. It
    grants access to the ``re.Match`` object, in case you want to access
    capturing groups, etc.

    NOTE(review): nothing in this class assigns ``match``; presumably the
    ``Regex`` expression sets it on the node after constructing it -- confirm
    against the caller.

    """
    # Only the extra attribute is declared here; the structural slots are
    # inherited from Node.
    __slots__ = ['match']
|
|
|
|
|
|
class RuleDecoratorMeta(type):
    """Metaclass that turns ``_rule``-tagged methods into a default grammar

    When a class is created, every plain function in its namespace that
    carries a ``_rule`` attribute contributes one ``name = expr`` line, where
    ``name`` is the method name minus any leading ``visit_`` and ``expr`` is
    the ``_rule`` value. The lines are ordered by the line number on which
    each function was defined, joined, and compiled into a ``Grammar`` that
    is stored as the class's ``grammar`` attribute. Classes without any such
    methods are created untouched.

    """

    def __new__(metaclass, name, bases, namespace):
        """Create the class, first injecting ``grammar`` into ``namespace``
        if any rule-tagged functions are present."""
        prefix = 'visit_'

        def unvisit(method_name):
            """Remove any leading "visit_" from a method name."""
            if method_name.startswith(prefix):
                return method_name[len(prefix):]
            return method_name

        # Only real functions count: other objects that merely happen to
        # have a ``_rule`` attribute are ignored.
        methods = [v for v in namespace.values()
                   if hasattr(v, '_rule') and isfunction(v)]
        if methods:
            from parsimonious.grammar import Grammar  # circular import dodge

            # Definition order decides rule order (and thereby which rule
            # comes first in the grammar text). This file is Python-3-only
            # syntax, so ``__code__`` is always the right attribute.
            methods.sort(key=lambda m: m.__code__.co_firstlineno)
            # Possible enhancement: once we get the Grammar extensibility
            # story solidified, we can have @rules *add* to the default
            # grammar rather than pave over it.
            namespace['grammar'] = Grammar(
                '\n'.join('{name} = {expr}'.format(name=unvisit(m.__name__),
                                                   expr=m._rule)
                          for m in methods))
        return super(RuleDecoratorMeta,
                     metaclass).__new__(metaclass, name, bases, namespace)
|
|
|
|
|
|
class NodeVisitor(object, metaclass=RuleDecoratorMeta):
    """Base class for things that turn a parse tree into something useful

    A visitor walks the tree depth-first. To use one: subclass this, write a
    ``visit_<expr name>()`` method for every expression you care about,
    create an instance, and call ``visit(top_node_of_parse_tree)``; the
    return value is your result. The API closely resembles
    ``ast.NodeVisitor``.

    The methods could all have been static, but that would be at least as
    odd at the call site as the ``()`` needed to instantiate. Using
    instances also leaves room for subclasses that carry state, such as
    options or a symbol table built up from a programming language's AST.

    The parse tree is never transformed in place, because...

    * The same ``Node`` object is likely referenced from several spots in
      the tree, so changing it through one reference would surprise you at
      the others.
    * Error reporting would become impossible: the "error" arrow would point
      into a half-transformed jumble of nodes--and that presumes the output
      is a tree at all, rather than a string or something else entirely.

    """

    #: The :term:`default grammar`: the one recommended for use with this
    #: visitor. If you populate this, you will be able to call
    #: :meth:`NodeVisitor.parse()` as a shortcut.
    grammar = None

    #: Classes of exceptions you actually intend to raise during visitation
    #: and which should propagate out of the visitor. These will not be
    #: wrapped in a VisitationError when they arise.
    unwrapped_exceptions = ()

    # TODO: If we need to optimize this, we can go back to putting subclasses
    # in charge of visiting children; they know when not to bother. Or we can
    # mark nodes as not descent-worthy in the grammar.
    def visit(self, node):
        """Walk a parse tree, transforming it into another representation.

        The children of ``node`` are visited first; then the method named
        after the rule in the :class:`~parsimonious.grammar.Grammar` that
        produced ``node`` is called with the node and the list of child
        results. For a rule like... ::

            bold = '<b>'

        ...that method is ``visit_bold()``. Writing those methods in a
        :class:`NodeVisitor` subclass is up to you; when one is missing,
        :meth:`generic_visit` is called instead.

        Exceptions raised along the way are wrapped in a ``VisitationError``
        carrying the offending node, except for ``VisitationError`` and
        ``UndefinedLabel`` themselves and anything listed in
        ``unwrapped_exceptions``, which propagate unchanged.

        """
        handler = getattr(self, 'visit_' + node.expr_name, self.generic_visit)

        # Both the descent into the children and the handler call are
        # guarded, so a failure anywhere gets tied to a spot in the tree.
        try:
            visited_children = [self.visit(child) for child in node]
            return handler(node, visited_children)
        except (VisitationError, UndefinedLabel):
            # Already wrapped further down; wrapping again would only bury
            # the original location.
            raise
        except Exception as exc:
            # Implementors may define exception classes that should not be
            # wrapped; everything else gets the parse tree tacked on so it's
            # easier to see where things went wrong.
            if not isinstance(exc, self.unwrapped_exceptions):
                raise VisitationError(exc, type(exc), node) from exc
            raise

    def generic_visit(self, node, visited_children):
        """Fallback used when no ``visit_*`` method matches a node

        :arg node: The node we're visiting
        :arg visited_children: The results of visiting the children of that
            node, in a list

        No single implementation seems sensible across all (or even most)
        use cases, so this one just raises ``NotImplementedError`` and
        leaves the real behavior to subclasses.

        """
        rule_text = node.expr.as_rule()
        raise NotImplementedError(
            'No visitor method was defined for this expression: %s' %
            rule_text)

    # Convenience methods:

    def parse(self, text, pos=0):
        """Parse ``text`` with this visitor's default grammar, then visit
        the resulting tree and return what the visit produced.

        ``SomeVisitor().parse('some_string')`` is a shortcut for
        ``SomeVisitor().visit(some_grammar.parse('some_string'))``.

        """
        return self._parse_or_match(text, pos, 'parse')

    def match(self, text, pos=0):
        """Like :meth:`parse`, but without insisting that the grammar
        consume the text all the way to the end.

        ``SomeVisitor().match('some_string')`` is a shortcut for
        ``SomeVisitor().visit(some_grammar.match('some_string'))``.

        """
        return self._parse_or_match(text, pos, 'match')

    # Internal convenience methods to help you write your own visitors:

    def lift_child(self, node, children):
        """Return the one and only visited child in place of ``node``.

        Unpacking enforces "exactly one": any other number of children
        raises ``ValueError``.

        """
        (only_child,) = children
        return only_child

    # Private methods:

    def _parse_or_match(self, text, pos, method_name):
        """Run ``parse`` or ``match`` (chosen by ``method_name``) on the
        default grammar and visit the tree it returns.

        Raise RuntimeError if there is no default grammar specified.

        """
        if self.grammar:
            grammar_method = getattr(self.grammar, method_name)
            return self.visit(grammar_method(text, pos=pos))
        raise RuntimeError(
            "The {cls}.{method}() shortcut won't work because {cls} was "
            "never associated with a specific grammar. Fill out its "
            "`grammar` attribute, and try again.".format(
                cls=self.__class__.__name__,
                method=method_name))
|
|
|
|
|
|
def rule(rule_string):
    """Decorator factory binding a grammar rule to a ``visit_*`` method

    For instance, this makes ``visit_digit`` the receiver of whatever the
    ``~"[0-9]"`` parse rule matches::

        @rule('~"[0-9]"')
        def visit_digit(self, node, visited_children):
            ...

    The rule string carries no ``digit = `` prefix; the rule's name is
    derived from the method name.

    When only one kind of visitor cares about a grammar, ``@rule`` spares
    you from flipping between the visitor and a separate grammar definition.

    Under the hood, the decorator merely records the rule string on the
    method. All ``@rule`` rules of a class are later stitched together into
    a :class:`~parsimonious.Grammar`, which becomes the NodeVisitor's
    :term:`default grammar`.

    The default rule of that grammar is normally the obvious one: whichever
    ``@rule`` appears first in the class. It can get surprising once the
    ``@rule`` calls are spread over subclasses, because "first" is currently
    decided purely by line number: the method on the smallest-numbered line
    wins. A future release is meant to pick the first ``@rule`` call on the
    basemost class that has one instead, so that a subclass which does not
    override the default rule's ``visit_*`` method cannot accidentally
    change which rule is the default.

    """
    def tag_with_rule(method):
        # XXX: Maybe register them on a class var instead so we can just
        # override a @rule'd visitor method on a subclass without blowing
        # away the rule string that comes with it.
        setattr(method, '_rule', rule_string)
        return method

    return tag_with_rule
|