"""A convenience which constructs expression trees from an easy-to-read syntax
|
|
|
|
Use this unless you have a compelling reason not to; it performs some
|
|
optimizations that would be tedious to do when constructing an expression tree
|
|
by hand.
|
|
|
|
"""
|
|
from collections import OrderedDict
|
|
from textwrap import dedent
|
|
|
|
from parsimonious.exceptions import BadGrammar, UndefinedLabel
|
|
from parsimonious.expressions import (Literal, Regex, Sequence, OneOf,
|
|
Lookahead, Quantifier, Optional, ZeroOrMore, OneOrMore, Not, TokenMatcher,
|
|
expression, is_callable)
|
|
from parsimonious.nodes import NodeVisitor
|
|
from parsimonious.utils import evaluate_string
|
|
|
|
class Grammar(OrderedDict):
|
|
"""A collection of rules that describe a language
|
|
|
|
You can start parsing from the default rule by calling ``parse()``
|
|
directly on the ``Grammar`` object::
|
|
|
|
g = Grammar('''
|
|
polite_greeting = greeting ", my good " title
|
|
greeting = "Hi" / "Hello"
|
|
title = "madam" / "sir"
|
|
''')
|
|
g.parse('Hello, my good sir')
|
|
|
|
Or start parsing from any of the other rules; you can pull them out of the
|
|
grammar as if it were a dictionary::
|
|
|
|
g['title'].parse('sir')
|
|
|
|
You could also just construct a bunch of ``Expression`` objects yourself
|
|
and stitch them together into a language, but using a ``Grammar`` has some
|
|
important advantages:
|
|
|
|
* Languages are much easier to define in the nice syntax it provides.
|
|
* Circular references aren't a pain.
|
|
* It does all kinds of whizzy space- and time-saving optimizations, like
|
|
factoring up repeated subexpressions into a single object, which should
|
|
increase cache hit ratio. [Is this implemented yet?]
|
|
|
|
"""
|
|
    def __init__(self, rules='', **more_rules):
        """Construct a grammar.

        :arg rules: A string of production rules, one per line.
        :arg more_rules: Additional kwargs whose names are rule names and
            values are Expressions or custom-coded callables which accomplish
            things the built-in rule syntax cannot. These take precedence over
            ``rules`` in case of naming conflicts.

        The grammar's :term:`default rule`, the one invoked when you call
        :meth:`parse()` or :meth:`match()`, is the first rule in ``rules``. It
        falls back to None if there are no string-based rules in this grammar.

        """

        decorated_custom_rules = {
            k: (expression(v, k, self) if is_callable(v) else v)
            for k, v in more_rules.items()}

        exprs, first = self._expressions_from_rules(rules, decorated_custom_rules)
        super().__init__(exprs.items())
        self.default_rule = first  # may be None
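
    # A hedged sketch of the custom-rule hooks above (the rule names are
    # invented for illustration): kwargs may be ready-made Expressions, which
    # string-based rules can then reference by name.
    #
    #     from parsimonious.expressions import Literal
    #     g = Grammar('polite = greeting ", sir"', greeting=Literal('Hello'))
    #     g.parse('Hello, sir')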

    def default(self, rule_name):
        """Return a new Grammar whose :term:`default rule` is ``rule_name``."""
        new = self._copy()
        new.default_rule = new[rule_name]
        return new
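
    # For instance, with the hypothetical ``g`` from the class docstring above,
    # you could switch the default rule and parse with it:
    #
    #     g.default('title').parse('sir')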

    def _copy(self):
        """Return a shallow copy of myself.

        Deep is unnecessary, since Expression trees are immutable. Subgrammars
        recreate all the Expressions from scratch, and AbstractGrammars have
        no Expressions.

        """
        new = Grammar.__new__(Grammar)
        super(Grammar, new).__init__(self.items())
        new.default_rule = self.default_rule
        return new

    def _expressions_from_rules(self, rules, custom_rules):
        """Return a 2-tuple: a dict of rule names pointing to their
        expressions, and then the first rule.

        It's a web of expressions, all referencing each other. Typically,
        there's a single root to the web of references, and that root is the
        starting symbol for parsing, but there's nothing saying you can't have
        multiple roots.

        :arg custom_rules: A map of rule names to custom-coded rules
            (Expressions)

        """
        tree = rule_grammar.parse(rules)
        return RuleVisitor(custom_rules).visit(tree)

    def parse(self, text, pos=0):
        """Parse some text with the :term:`default rule`.

        :arg pos: The index at which to start parsing

        """
        self._check_default_rule()
        return self.default_rule.parse(text, pos=pos)

    def match(self, text, pos=0):
        """Parse some text with the :term:`default rule` but not necessarily
        all the way to the end.

        :arg pos: The index at which to start parsing

        """
        self._check_default_rule()
        return self.default_rule.match(text, pos=pos)

    def _check_default_rule(self):
        """Raise RuntimeError if there is no default rule defined."""
        if not self.default_rule:
            raise RuntimeError("Can't call parse() on a Grammar that has no "
                               "default rule. Choose a specific rule instead, "
                               "like some_grammar['some_rule'].parse(...).")

    def __str__(self):
        """Return a rule string that, when passed to the constructor, would
        reconstitute the grammar."""
        exprs = [self.default_rule] if self.default_rule else []
        exprs.extend(expr for expr in self.values() if
                     expr is not self.default_rule)
        return '\n'.join(expr.as_rule() for expr in exprs)

    def __repr__(self):
        """Return an expression that will reconstitute the grammar."""
        return "Grammar({!r})".format(str(self))
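

# Illustrative round trip (a sketch, not an API guarantee): because __str__
# emits a rule string the constructor accepts, a grammar can be rebuilt or
# extended by concatenating more rules onto its rule string.
#
#     g2 = Grammar(str(g))                          # equivalent rules
#     g3 = Grammar(str(g) + '\nfarewell = "Bye"')   # g plus one more rule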


class TokenGrammar(Grammar):
    """A Grammar which takes a list of pre-lexed tokens instead of text

    This is useful if you want to do the lexing yourself, as a separate pass:
    for example, to implement indentation-based languages.

    """
    def _expressions_from_rules(self, rules, custom_rules):
        tree = rule_grammar.parse(rules)
        return TokenRuleVisitor(custom_rules).visit(tree)
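

# Illustrative sketch of token parsing (``Token`` comes from
# parsimonious.utils; the rule name is made up): TokenMatcher compares each
# token's ``type`` attribute against the quoted names.
#
#     from parsimonious.utils import Token
#     shouting = TokenGrammar('shout = "bold" "stop"')
#     shouting.parse([Token('bold'), Token('stop')])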


class BootstrappingGrammar(Grammar):
    """The grammar used to recognize the textual rules that describe other
    grammars

    This grammar gets its start from some hard-coded Expressions and claws its
    way from there to an expression tree that describes how to parse the
    grammar description syntax.

    """
    def _expressions_from_rules(self, rule_syntax, custom_rules):
        """Return the rules for parsing the grammar definition syntax.

        Return a 2-tuple: a dict of rule names pointing to their expressions,
        and then the top-level expression for the first rule.

        """
        # Hard-code enough of the rules to parse the grammar that describes the
        # grammar description language, to bootstrap:
        comment = Regex(r'#[^\r\n]*', name='comment')
        meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness')
        _ = ZeroOrMore(meaninglessness, name='_')
        equals = Sequence(Literal('='), _, name='equals')
        label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label')
        reference = Sequence(label, Not(equals), name='reference')
        quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier')
        # This pattern supports empty literals. TODO: A problem?
        spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"',
                                  ignore_case=True,
                                  dot_all=True,
                                  name='spaceless_literal')
        literal = Sequence(spaceless_literal, _, name='literal')
        regex = Sequence(Literal('~'),
                         literal,
                         Regex('[ilmsuxa]*', ignore_case=True),
                         _,
                         name='regex')
        atom = OneOf(reference, literal, regex, name='atom')
        quantified = Sequence(atom, quantifier, name='quantified')

        term = OneOf(quantified, atom, name='term')
        not_term = Sequence(Literal('!'), term, _, name='not_term')
        term.members = (not_term,) + term.members

        sequence = Sequence(term, OneOrMore(term), name='sequence')
        or_term = Sequence(Literal('/'), _, term, name='or_term')
        ored = Sequence(term, OneOrMore(or_term), name='ored')
        expression = OneOf(ored, sequence, term, name='expression')
        rule = Sequence(label, equals, expression, name='rule')
        rules = Sequence(_, OneOrMore(rule), name='rules')

        # Use those hard-coded rules to parse the (more extensive) rule syntax.
        # (For example, unless I start using parentheses in the rule language
        # definition itself, I should never have to hard-code expressions for
        # those above.)

        rule_tree = rules.parse(rule_syntax)

        # Turn the parse tree into a map of expressions:
        return RuleVisitor().visit(rule_tree)


# The grammar for parsing PEG grammar definitions:
# This is a nice, simple grammar. We may someday add to it, but it's a safe bet
# that the future will always be a superset of this.
rule_syntax = (r'''
    # Ignored things (represented by _) are typically hung off the end of the
    # leafmost kinds of nodes. Literals like "/" count as leaves.

    rules = _ rule*
    rule = label equals expression
    equals = "=" _
    literal = spaceless_literal _

    # So you can't spell a regex like `~"..." ilm`:
    spaceless_literal = ~"u?r?b?\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is /
                        ~"u?r?b?'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'"is

    expression = ored / sequence / term
    or_term = "/" _ term
    ored = term or_term+
    sequence = term term+
    not_term = "!" term _
    lookahead_term = "&" term _
    term = not_term / lookahead_term / quantified / atom
    quantified = atom quantifier
    atom = reference / literal / regex / parenthesized
    regex = "~" spaceless_literal ~"[ilmsuxa]*"i _
    parenthesized = "(" _ expression ")" _
    quantifier = ~r"[*+?]|\{\d*,\d+\}|\{\d+,\d*\}|\{\d+\}" _
    reference = label !equals

    # A subsequent equal sign is the only thing that distinguishes a label
    # (which begins a new rule) from a reference (which is just a pointer to a
    # rule defined somewhere else):
    label = ~"[a-zA-Z_][a-zA-Z_0-9]*(?![\"'])" _

    # _ = ~r"\s*(?:#[^\r\n]*)?\s*"
    _ = meaninglessness*
    meaninglessness = ~r"\s+" / comment
    comment = ~r"#[^\r\n]*"
    ''')
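

# Illustrative sketch of the richer syntax defined above (the rule names are
# made up): parenthesized groups and brace quantifiers are both legal, and
# ``&term`` / ``!term`` give lookahead and negative lookahead.
#
#     g = Grammar('''
#         greeting = ("Hi" / "Hello") ", " name
#         name     = ~"[A-Za-z]"{1,10}
#         ''')
#     g.parse('Hello, Gus')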


class LazyReference(str):
    """A lazy reference to a rule, which we resolve after grokking all the
    rules"""

    name = ''

    def resolve_refs(self, rule_map):
        """
        Traverse the rule map following top-level lazy references,
        until we reach a cycle (raise an error) or a concrete expression.

        For example, the following is a circular reference:
            foo = bar
            bar = foo2
            foo2 = foo

        Note that every RHS of a grammar rule _must_ be either a
        LazyReference or a concrete expression, so the reference chain will
        eventually either terminate or find a cycle.
        """
        seen = set()
        cur = self
        while True:
            if cur in seen:
                raise BadGrammar(f"Circular Reference resolving {self.name}={self}.")
            else:
                seen.add(cur)
            try:
                cur = rule_map[str(cur)]
            except KeyError:
                raise UndefinedLabel(cur)
            if not isinstance(cur, LazyReference):
                return cur

    # Just for debugging:
    def _as_rhs(self):
        return '<LazyReference to %s>' % self
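

# Illustrative sketch of the cycle detection above (made-up rule names): a
# grammar whose rules only point at each other never reaches a concrete
# expression, so construction fails.
#
#     Grammar('foo = bar\n'
#             'bar = foo')   # raises BadGrammar("Circular Reference ...")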


class RuleVisitor(NodeVisitor):
    """Turns a parse tree of a grammar definition into a map of ``Expression``
    objects

    This is the magic piece that breathes life into a parsed bunch of parse
    rules, allowing them to go forth and parse other things.

    """
    quantifier_classes = {'?': Optional, '*': ZeroOrMore, '+': OneOrMore}

    visit_expression = visit_term = visit_atom = NodeVisitor.lift_child

    def __init__(self, custom_rules=None):
        """Construct.

        :arg custom_rules: A dict of {rule name: expression} holding custom
            rules which will take precedence over the others

        """
        self.custom_rules = custom_rules or {}
        self._last_literal_node_and_type = None

    def visit_parenthesized(self, node, parenthesized):
        """Treat a parenthesized subexpression as just its contents.

        Its position in the tree suffices to maintain its grouping semantics.

        """
        left_paren, _, expression, right_paren, _ = parenthesized
        return expression

    def visit_quantifier(self, node, quantifier):
        """Turn a quantifier into just its symbol-matching node."""
        symbol, _ = quantifier
        return symbol

    def visit_quantified(self, node, quantified):
        atom, quantifier = quantified
        try:
            return self.quantifier_classes[quantifier.text](atom)
        except KeyError:
            # This should pass: assert re.fullmatch(r"\{(\d*)(,(\d*))?\}", quantifier.text)
            quantifier = quantifier.text[1:-1].split(",")
            if len(quantifier) == 1:
                min_match = max_match = int(quantifier[0])
            else:
                min_match = int(quantifier[0]) if quantifier[0] else 0
                max_match = int(quantifier[1]) if quantifier[1] else float('inf')
            return Quantifier(atom, min=min_match, max=max_match)
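
    # To spell out the branch above: '?', '*', and '+' map straight to
    # Optional, ZeroOrMore, and OneOrMore, while brace forms become Quantifier
    # instances, e.g. "{2,5}" -> Quantifier(atom, min=2, max=5); "{2,}" leaves
    # max unbounded (float('inf')), and "{,5}" implies min=0.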

    def visit_lookahead_term(self, node, lookahead_term):
        ampersand, term, _ = lookahead_term
        return Lookahead(term)

    def visit_not_term(self, node, not_term):
        exclamation, term, _ = not_term
        return Not(term)

    def visit_rule(self, node, rule):
        """Assign a name to the Expression and return it."""
        label, equals, expression = rule
        expression.name = label  # Assign a name to the expr.
        return expression

    def visit_sequence(self, node, sequence):
        """A parsed Sequence looks like [term node, OneOrMore node of
        ``another_term``s]. Flatten it out."""
        term, other_terms = sequence
        return Sequence(term, *other_terms)

    def visit_ored(self, node, ored):
        first_term, other_terms = ored
        return OneOf(first_term, *other_terms)

    def visit_or_term(self, node, or_term):
        """Return just the term from an ``or_term``.

        We already know it's going to be ored, from the containing ``ored``.

        """
        slash, _, term = or_term
        return term

    def visit_label(self, node, label):
        """Turn a label into a unicode string."""
        name, _ = label
        return name.text

    def visit_reference(self, node, reference):
        """Stick a :class:`LazyReference` in the tree as a placeholder.

        We resolve them all later.

        """
        label, not_equals = reference
        return LazyReference(label)

    def visit_regex(self, node, regex):
        """Return a ``Regex`` expression."""
        tilde, literal, flags, _ = regex
        flags = flags.text.upper()
        pattern = literal.literal  # Pull the string back out of the Literal
                                   # object.
        return Regex(pattern, ignore_case='I' in flags,
                              locale='L' in flags,
                              multiline='M' in flags,
                              dot_all='S' in flags,
                              unicode='U' in flags,
                              verbose='X' in flags,
                              ascii='A' in flags)

    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``Literal`` that recognizes it."""
        literal_value = evaluate_string(spaceless_literal.text)
        if self._last_literal_node_and_type:
            last_node, last_type = self._last_literal_node_and_type
            if last_type != type(literal_value):
                raise BadGrammar(dedent(f"""\
                    Found {last_node.text} ({last_type}) and {spaceless_literal.text} ({type(literal_value)}) string literals.
                    All strings in a single grammar must be of the same type.
                    """)
                )

        self._last_literal_node_and_type = spaceless_literal, type(literal_value)

        return Literal(literal_value)

    def visit_literal(self, node, literal):
        """Pick just the literal out of a literal-and-junk combo."""
        spaceless_literal, _ = literal
        return spaceless_literal

    def generic_visit(self, node, visited_children):
        """Replace childbearing nodes with a list of their children; keep
        others untouched.

        For our case, if a node has children, only the children are important.
        Otherwise, keep the node around for (for example) the flags of the
        regex rule. Most of these kept-around nodes are subsequently thrown
        away by the other visitor methods.

        We can't simply hang the visited children off the original node; that
        would be disastrous if the node occurred in more than one place in the
        tree.

        """
        return visited_children or node  # should semantically be a tuple

    def visit_rules(self, node, rules_list):
        """Collate all the rules into a map. Return (map, default rule).

        The default rule is the first one. Or, if you have more than one rule
        of that name, it's the last-occurring rule of that name. (This lets
        you override the default rule when you extend a grammar.) If there are
        no string-based rules, the default rule is None, because the custom
        rules, due to being kwarg-based, are unordered.

        """
        _, rules = rules_list

        # Map each rule's name to its Expression. Later rules of the same name
        # override earlier ones. This lets us define rules multiple times and
        # have the last declaration win, so you can extend grammars by
        # concatenation.
        rule_map = OrderedDict((expr.name, expr) for expr in rules)

        # And custom rules override string-based rules. This is the least
        # surprising choice when you compare the dict constructor:
        # dict({'x': 5}, x=6).
        rule_map.update(self.custom_rules)

        # Resolve references. This tolerates forward references.
        for name, rule in list(rule_map.items()):
            if hasattr(rule, 'resolve_refs'):
                # Some custom rules may not define a resolve_refs method,
                # though anything that inherits from Expression will have it.
                rule_map[name] = rule.resolve_refs(rule_map)

        # isinstance() is a temporary hack around the fact that * rules don't
        # always get transformed into lists by NodeVisitor. We should fix that;
        # it's surprising and requires writing lame branches like this.
        return rule_map, (rule_map[rules[0].name]
                          if isinstance(rules, list) and rules else None)
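

# Illustrative sketch of the "last declaration wins" behavior described in
# visit_rules above (rule names are made up):
#
#     g = Grammar('greeting = "Hi"\n'
#                 'greeting = "Hello"')
#     g.parse('Hello')  # the later "Hello" definition is the one that sticks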


class TokenRuleVisitor(RuleVisitor):
    """A visitor which builds expression trees meant to work on sequences of
    pre-lexed tokens rather than strings"""

    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``TokenMatcher`` that matches
        ``Token`` objects by their ``type`` attributes."""
        return TokenMatcher(evaluate_string(spaceless_literal.text))

    def visit_regex(self, node, regex):
        tilde, literal, flags, _ = regex
        raise BadGrammar('Regexes do not make sense in TokenGrammars, since '
                         'TokenGrammars operate on pre-lexed tokens rather '
                         'than characters.')


# Bootstrap to level 1...
rule_grammar = BootstrappingGrammar(rule_syntax)
# ...and then to level 2. This establishes that the node tree of our rule
# syntax is built by the same machinery that will build trees of our users'
# grammars. And the correctness of that tree is tested, indirectly, in
# test_grammar.
rule_grammar = Grammar(rule_syntax)


# TODO: Teach Expression trees how to spit out Python representations of
# themselves. Then we can just paste that in above, and we won't have to
# bootstrap on import. Though it'll be a little less DRY. [Ah, but this is not
# so clean, because it would have to output multiple statements to get multiple
# refs to a single expression hooked up.]