# CCR/.venv/lib/python3.12/site-packages/parsimonious/tests/test_grammar.py
#
# 681 lines
# 25 KiB
# Python

# coding=utf-8
from sys import version_info
from unittest import TestCase
import pytest
from parsimonious.exceptions import BadGrammar, LeftRecursionError, ParseError, UndefinedLabel, VisitationError
from parsimonious.expressions import Literal, Lookahead, Regex, Sequence, TokenMatcher, is_callable
from parsimonious.grammar import rule_grammar, rule_syntax, RuleVisitor, Grammar, TokenGrammar, LazyReference
from parsimonious.nodes import Node
from parsimonious.utils import Token
class BootstrappingGrammarTests(TestCase):
    """Tests for the expressions in the grammar that parses the grammar
    definition syntax"""

    def test_quantifier(self):
        """Each single-character quantifier parses to a one-character node
        followed by a zero-width ``_`` node."""
        quantifier = rule_grammar['quantifier']
        for text in ('*', '?', '+'):
            expected = Node(quantifier, text, 0, 1, children=[
                Node(quantifier.members[0], text, 0, 1),
                Node(rule_grammar['_'], text, 1, 1)])
            self.assertEqual(quantifier.parse(text), expected)

    def test_spaceless_literal(self):
        """``spaceless_literal`` matches a plain quoted string and an
        ``r``-prefixed string containing an escaped quote."""
        spaceless_literal = rule_grammar['spaceless_literal']

        text = '"anything but quotes#$*&^"'
        self.assertEqual(
            spaceless_literal.parse(text),
            Node(spaceless_literal, text, 0, len(text), children=[
                Node(spaceless_literal.members[0], text, 0, len(text))]))

        # The raw literal below is the 5 characters  r " \ " "
        text = r'''r"\""'''
        self.assertEqual(
            spaceless_literal.parse(text),
            Node(spaceless_literal, text, 0, 5, children=[
                Node(spaceless_literal.members[0], text, 0, 5)]))

    def test_regex(self):
        """A ``~"..."`` regex with trailing flags parses into tilde, literal,
        flags and a zero-width ``_`` node."""
        text = '~"[a-zA-Z_][a-zA-Z_0-9]*"LI'
        regex = rule_grammar['regex']
        spaceless_literal = rule_grammar['spaceless_literal']
        self.assertEqual(
            regex.parse(text),
            Node(regex, text, 0, len(text), children=[
                Node(Literal('~'), text, 0, 1),
                Node(spaceless_literal, text, 1, 25, children=[
                    Node(spaceless_literal.members[0], text, 1, 25)]),
                Node(regex.members[2], text, 25, 27),
                Node(rule_grammar['_'], text, 27, 27)]))

    def test_successes(self):
        """Make sure the PEG recognition grammar succeeds on various inputs."""
        cases = [
            ('label', '_'),
            ('label', 'jeff'),
            ('label', '_THIS_THING'),

            ('atom', 'some_label'),
            ('atom', '"some literal"'),
            ('atom', '~"some regex"i'),

            ('quantified', '~"some regex"i*'),
            ('quantified', 'thing+'),
            ('quantified', '"hi"?'),

            ('term', 'this'),
            ('term', 'that+'),

            ('sequence', 'this that? other'),

            ('ored', 'this / that+ / "other"'),

            # + is higher precedence than &, so ``lookahead_term`` should
            # match the whole thing:
            ('lookahead_term', '&this+'),

            ('expression', 'this'),
            ('expression', 'this? that other*'),
            ('expression', '&this / that+ / "other"'),
            ('expression', 'this / that? / "other"+'),

            ('rule', 'this = that\r'),
            ('rule', 'this = the? that other* \t\r'),
            ('rule', 'the=~"hi*"\n'),
        ]
        for rule_name, text in cases:
            self.assertTrue(rule_grammar[rule_name].parse(text),
                            msg='%s did not parse %r' % (rule_name, text))

        # Finally, a complete multi-rule grammar through the default rule.
        self.assertTrue(rule_grammar.parse('''
this = the? that other*
that = "thing"
the=~"hi*"
other = "ahoy hoy"
'''))
class RuleVisitorTests(TestCase):
    """Exercise ``RuleVisitor`` directly on trees produced by ``rule_grammar``.

    As I write these, Grammar is not yet fully implemented. Normally, there'd
    be no reason to use ``RuleVisitor`` directly.

    """

    def test_round_trip(self):
        """Test a simple round trip.

        Parse a one-rule grammar, visit the tree to obtain the default rule,
        and use that rule to parse another piece of text.

        """
        parse_tree = rule_grammar.parse('''number = ~"[0-9]+"\n''')
        _, start_rule = RuleVisitor().visit(parse_tree)

        sample = '98'
        expected = Node(start_rule, sample, 0, 2)
        self.assertEqual(start_rule.parse(sample), expected)

    def test_undefined_rule(self):
        """Make sure we throw the right exception on undefined rules."""
        parse_tree = rule_grammar.parse('boy = howdy\n')
        visit = RuleVisitor().visit
        with self.assertRaises(UndefinedLabel):
            visit(parse_tree)

    def test_optional(self):
        """A ``?``-quantified literal yields an outer node wrapping the
        literal's node."""
        parse_tree = rule_grammar.parse('boy = "howdy"?\n')
        _, start_rule = RuleVisitor().visit(parse_tree)

        greeting = 'howdy'
        # One Node comes from the Optional, and one from the Literal inside.
        inner = Node(Literal("howdy"), greeting, 0, 5)
        outer = Node(start_rule, greeting, 0, 5, children=[inner])
        self.assertEqual(start_rule.parse(greeting), outer)
def function_rule(text, pos):
    """This is an example of a grammar rule implemented as a function, and is
    provided as a test fixture.

    Returns the position just past the token ``'function'`` when ``text``
    contains it starting at ``pos``; otherwise returns ``None``.
    """
    token = 'function'
    # Match in place from ``pos`` rather than slicing, which would copy the
    # rest of ``text`` on every call.
    return pos + len(token) if text.startswith(token, pos) else None
class GrammarTests(TestCase):
    """Integration-test ``Grammar``: feed it a PEG and see if it works."""

    def method_rule(self, text, pos):
        """This is an example of a grammar rule implemented as a method, and is
        provided as a test fixture.

        Returns the position just past ``'method'`` when ``text`` has it at
        ``pos``; otherwise ``None``.
        """
        token = 'method'
        return pos + len(token) if text.startswith(token, pos) else None

    @staticmethod
    def descriptor_rule(text, pos):
        """This is an example of a grammar rule implemented as a descriptor,
        and is provided as a test fixture.

        Returns the position just past ``'descriptor'`` when ``text`` has it
        at ``pos``; otherwise ``None``.
        """
        token = 'descriptor'
        return pos + len(token) if text.startswith(token, pos) else None

    # Captured inside the class body, so this holds the ``staticmethod``
    # object itself rather than the plain function attribute access returns.
    rules = {"descriptor_rule": descriptor_rule}

    def test_expressions_from_rules(self):
        """Test the ``Grammar`` base class's ability to compile an expression
        tree from rules.

        Only checks that a ``Grammar`` built from rule text parses input into
        the expected tree.

        """
        greeting_grammar = Grammar('greeting = "hi" / "howdy"')
        tree = greeting_grammar.parse('hi')
        self.assertEqual(tree, Node(greeting_grammar['greeting'], 'hi', 0, 2, children=[
            Node(Literal('hi'), 'hi', 0, 2)]))

    def test_unicode(self):
        """Assert that a ``Grammar`` can convert into a string-formatted series
        of rules."""
        grammar = Grammar(r"""
bold_text = bold_open text bold_close
text = ~"[A-Z 0-9]*"i
bold_open = "(("
bold_close = "))"
""")
        lines = str(grammar).splitlines()
        self.assertEqual(lines[0], 'bold_text = bold_open text bold_close')
        self.assertIn("text = ~'[A-Z 0-9]*'i%s" % ('u' if version_info >= (3,) else ''),
                      lines)
        self.assertIn("bold_open = '(('", lines)
        self.assertIn("bold_close = '))'", lines)
        self.assertEqual(len(lines), 4)

    def test_match(self):
        """Make sure partial-matching (with pos) works."""
        grammar = Grammar(r"""
bold_text = bold_open text bold_close
text = ~"[A-Z 0-9]*"i
bold_open = "(("
bold_close = "))"
""")
        s = ' ((boo))yah'
        self.assertEqual(grammar.match(s, pos=1), Node(grammar['bold_text'], s, 1, 8, children=[
            Node(grammar['bold_open'], s, 1, 3),
            Node(grammar['text'], s, 3, 6),
            Node(grammar['bold_close'], s, 6, 8)]))

    def test_bad_grammar(self):
        """Constructing a Grammar with bad rules should raise ParseError."""
        self.assertRaises(ParseError, Grammar, 'just a bunch of junk')

    def test_comments(self):
        """Test tolerance of comments and blank lines in and around rules."""
        grammar = Grammar(r"""# This is a grammar.
# It sure is.
bold_text = stars text stars # nice
text = ~"[A-Z 0-9]*"i #dude
stars = "**"
# Pretty good
#Oh yeah.#""")  # Make sure a comment doesn't need a
                # \n or \r to end.
        self.assertEqual(
            sorted(str(grammar).splitlines()),
            ['''bold_text = stars text stars''',
             # TODO: Unicode flag is on by default in Python 3. I wonder if we
             # should turn it on all the time in Parsimonious.
             """stars = '**'""",
             '''text = ~'[A-Z 0-9]*'i%s''' % ('u' if version_info >= (3,)
                                               else '')])

    def test_multi_line(self):
        """Make sure we tolerate all sorts of crazy line breaks and comments in
        the middle of rules."""
        grammar = Grammar("""
bold_text = bold_open # commenty comment
text # more comment
bold_close
text = ~"[A-Z 0-9]*"i
bold_open = "((" bold_close = "))"
""")
        self.assertTrue(grammar.parse('((booyah))') is not None)

    def test_not(self):
        """Make sure "not" predicates get parsed and work properly."""
        grammar = Grammar(r'''not_arp = !"arp" ~"[a-z]+"''')
        self.assertRaises(ParseError, grammar.parse, 'arp')
        self.assertTrue(grammar.parse('argle') is not None)

    def test_lookahead(self):
        """Make sure ``&`` lookahead predicates get parsed and produce a
        zero-width node."""
        grammar = Grammar(r'''starts_with_a = &"a" ~"[a-z]+"''')
        self.assertRaises(ParseError, grammar.parse, 'burp')

        s = 'arp'
        self.assertEqual(grammar.parse(s), Node(grammar['starts_with_a'], s, 0, 3, children=[
            Node(Lookahead(Literal('a')), s, 0, 0),
            Node(Regex(r'[a-z]+'), s, 0, 3)]))

    def test_parens(self):
        """Make sure a parenthesized group can be quantified as a unit."""
        grammar = Grammar(r'''sequence = "chitty" (" " "bang")+''')
        # Make sure it's not as if the parens aren't there:
        self.assertRaises(ParseError, grammar.parse, 'chitty bangbang')

        s = 'chitty bang bang'
        # Spelled out line by line so the nesting indentation of the rendered
        # tree (4 spaces per level) is explicit and cannot be lost to
        # whitespace reformatting.
        expected_tree = '\n'.join([
            '<Node called "sequence" matching "chitty bang bang">',
            '    <Node matching "chitty">',
            '    <Node matching " bang bang">',
            '        <Node matching " bang">',
            '            <Node matching " ">',
            '            <Node matching "bang">',
            '        <Node matching " bang">',
            '            <Node matching " ">',
            '            <Node matching "bang">',
        ])
        self.assertEqual(str(grammar.parse(s)), expected_tree)

    def test_resolve_refs_order(self):
        """Smoke-test a circumstance where lazy references don't get resolved."""
        grammar = Grammar("""
expression = "(" terms ")"
terms = term+
term = number
number = ~r"[0-9]+"
""")
        grammar.parse('(34)')

    def test_resolve_refs_completeness(self):
        """Smoke-test another circumstance where lazy references don't get resolved."""
        grammar = Grammar(r"""
block = "{" _ item* "}" _
# An item is an element of a block.
item = number / word / block / paren
# Parens are for delimiting subexpressions.
paren = "(" _ item* ")" _
# Words are barewords, unquoted things, other than literals, that can live
# in lists. We may renege on some of these chars later, especially ".". We
# may add Unicode.
word = spaceless_word _
spaceless_word = ~r"[-a-z`~!@#$%^&*_+=|\\;<>,.?][-a-z0-9`~!@#$%^&*_+=|\\;<>,.?]*"i
number = ~r"[0-9]+" _ # There are decimals and strings and other stuff back on the "parsing" branch, once you get this working.
_ = meaninglessness*
meaninglessness = whitespace
whitespace = ~r"\s+"
""")
        grammar.parse('{log (add 3 to 5)}')

    def test_infinite_loop(self):
        """Smoke-test a grammar that was causing infinite loops while building.

        This was going awry because the "int" rule was never getting marked as
        resolved, so it would just keep trying to resolve it over and over.

        """
        Grammar("""
digits = digit+
int = digits
digit = ~"[0-9]"
number = int
main = number
""")

    def test_circular_toplevel_reference(self):
        """Rules that only refer to each other (or themselves) in a cycle
        should fail to build with ``VisitationError``."""
        with pytest.raises(VisitationError):
            Grammar("""
foo = bar
bar = foo
""")
        with pytest.raises(VisitationError):
            Grammar("""
foo = foo
bar = foo
""")
        with pytest.raises(VisitationError):
            Grammar("""
foo = bar
bar = baz
baz = foo
""")

    def test_right_recursive(self):
        """Right-recursive refs should resolve."""
        grammar = Grammar("""
digits = digit digits?
digit = ~r"[0-9]"
""")
        self.assertTrue(grammar.parse('12') is not None)

    def test_badly_circular(self):
        """Uselessly circular references should be detected by the grammar
        compiler."""
        # skipTest() raises, so the Grammar construction below never runs
        # while this skip is in place.
        self.skipTest('We have yet to make the grammar compiler detect these.')
        Grammar("""
foo = bar
bar = foo
""")

    def test_parens_with_leading_whitespace(self):
        """Make sure a parenthesized expression is allowed to have leading
        whitespace when nested directly inside another."""
        Grammar("""foo = ( ("c") )""").parse('c')

    def test_single_quoted_literals(self):
        """Single-quoted literals work and may contain a double quote."""
        Grammar("""foo = 'a' '"'""").parse('a"')

    def test_simple_custom_rules(self):
        """Run 2-arg custom-coded rules through their paces."""
        grammar = Grammar("""
bracketed_digit = start digit end
start = '['
end = ']'""",
                          digit=lambda text, pos:
                              (pos + 1) if text[pos].isdigit() else None)
        s = '[6]'
        self.assertEqual(grammar.parse(s),
                         Node(grammar['bracketed_digit'], s, 0, 3, children=[
                             Node(grammar['start'], s, 0, 1),
                             Node(grammar['digit'], s, 1, 2),
                             Node(grammar['end'], s, 2, 3)]))

    def test_complex_custom_rules(self):
        """Run 5-arg custom rules through their paces.

        Incidentally tests returning an actual Node from the custom rule.

        """
        grammar = Grammar("""
bracketed_digit = start digit end
start = '['
end = ']'
real_digit = '6'""",
                          # In this particular implementation of the digit rule, no node is
                          # generated for `digit`; it falls right through to `real_digit`.
                          # I'm not sure if this could lead to problems; I can't think of
                          # any, but it's probably not a great idea.
                          digit=lambda text, pos, cache, error, grammar:
                              grammar['real_digit'].match_core(text, pos, cache, error))
        s = '[6]'
        self.assertEqual(grammar.parse(s),
                         Node(grammar['bracketed_digit'], s, 0, 3, children=[
                             Node(grammar['start'], s, 0, 1),
                             Node(grammar['real_digit'], s, 1, 2),
                             Node(grammar['end'], s, 2, 3)]))

    def test_lazy_custom_rules(self):
        """Make sure LazyReferences manually shoved into custom rules are
        resolved.

        Incidentally test passing full-on Expressions as custom rules and
        having a custom rule as the default one.

        """
        grammar = Grammar("""
four = '4'
five = '5'""",
                          forty_five=Sequence(LazyReference('four'),
                                              LazyReference('five'),
                                              name='forty_five')).default('forty_five')
        s = '45'
        self.assertEqual(grammar.parse(s),
                         Node(grammar['forty_five'], s, 0, 2, children=[
                             Node(grammar['four'], s, 0, 1),
                             Node(grammar['five'], s, 1, 2)]))

    def test_unconnected_custom_rules(self):
        """Make sure custom rules that aren't hooked to any other rules still
        get included in the grammar and that lone ones get set as the
        default.

        Incidentally test Grammar's `rules` default arg.

        """
        grammar = Grammar(one_char=lambda text, pos: pos + 1).default('one_char')
        s = '4'
        self.assertEqual(grammar.parse(s),
                         Node(grammar['one_char'], s, 0, 1))

    def test_callability_of_routines(self):
        """``is_callable`` accepts a function, a bound method and the
        ``staticmethod`` object stored in ``rules``."""
        self.assertTrue(is_callable(function_rule))
        self.assertTrue(is_callable(self.method_rule))
        self.assertTrue(is_callable(self.rules['descriptor_rule']))

    def test_callability_custom_rules(self):
        """Confirms that functions, methods and method descriptors can all be
        used to supply custom grammar rules.
        """
        grammar = Grammar("""
default = function method descriptor
""",
                          function=function_rule,
                          method=self.method_rule,
                          descriptor=self.rules['descriptor_rule'],
                          )
        result = grammar.parse('functionmethoddescriptor')
        rule_names = [node.expr.name for node in result.children]
        self.assertEqual(rule_names, ['function', 'method', 'descriptor'])

    def test_lazy_default_rule(self):
        """Make sure we get an actual rule set as our default rule, even when
        the first rule has forward references and is thus a LazyReference at
        some point during grammar compilation.

        """
        grammar = Grammar(r"""
styled_text = text
text = "hi"
""")
        self.assertEqual(grammar.parse('hi'), Node(grammar['text'], 'hi', 0, 2))

    def test_immutable_grammar(self):
        """Make sure that a Grammar is immutable after being created."""
        grammar = Grammar(r"""
foo = 'bar'
""")

        def assign_rule(target):
            target['foo'] = 1

        def merge_rules(target):
            new_grammar = Grammar(r"""
baz = 'biff'
""")
            target.update(new_grammar)

        # NOTE(review): both helpers receive the one-element list ``[grammar]``,
        # not ``grammar``. Indexing a list with a str raises TypeError and
        # lists have no ``update`` attribute, so these assertions pass without
        # touching the Grammar at all. Passing ``grammar`` directly would test
        # the Grammar itself, but that changes what is asserted -- confirm that
        # Grammar is meant to reject item assignment / ``update`` before
        # changing the arguments.
        self.assertRaises(TypeError, assign_rule, [grammar])
        self.assertRaises(AttributeError, merge_rules, [grammar])

    def test_repr(self):
        """``repr`` of a Grammar is a non-empty value."""
        self.assertTrue(repr(Grammar(r'foo = "a"')))

    def test_rule_ordering_is_preserved(self):
        """Rule names come back from ``keys()`` in definition order."""
        grammar = Grammar('\n'.join('r%s = "something"' % i for i in range(100)))
        self.assertEqual(
            list(grammar.keys()),
            ['r%s' % i for i in range(100)])

    def test_rule_ordering_is_preserved_on_shallow_copies(self):
        """Definition order also survives ``_copy()``."""
        grammar = Grammar('\n'.join('r%s = "something"' % i for i in range(100)))._copy()
        self.assertEqual(
            list(grammar.keys()),
            ['r%s' % i for i in range(100)])

    def test_repetitions(self):
        """``{m,n}``, ``?``, ``+`` and ``*`` accept exactly the listed
        repetition counts and reject the others."""
        grammar = Grammar(r'''
left_missing = "a"{,5}
right_missing = "a"{5,}
exact = "a"{5}
range = "a"{2,5}
optional = "a"?
plus = "a"+
star = "a"*
''')
        should_parse = [
            ("left_missing", ["a" * i for i in range(6)]),
            ("right_missing", ["a" * i for i in range(5, 8)]),
            ("exact", ["a" * 5]),
            ("range", ["a" * i for i in range(2, 6)]),
            ("optional", ["", "a"]),
            ("plus", ["a", "aa"]),
            ("star", ["", "a", "aa"]),
        ]
        for rule, examples in should_parse:
            for example in examples:
                assert grammar[rule].parse(example)

        should_not_parse = [
            ("left_missing", ["a" * 6]),
            ("right_missing", ["a" * i for i in range(5)]),
            ("exact", ["a" * i for i in list(range(5)) + list(range(6, 10))]),
            ("range", ["a" * i for i in list(range(2)) + list(range(6, 10))]),
            ("optional", ["aa"]),
            ("plus", [""]),
            ("star", ["b"]),
        ]
        for rule, examples in should_not_parse:
            for example in examples:
                with pytest.raises(ParseError):
                    grammar[rule].parse(example)

    def test_equal(self):
        """Grammars built from the same text compare equal; grammars that
        differ in a regex bound or in predicate kind do not."""
        grammar_def = (r"""
x = y / z / ""
y = "y" x
z = "z" x
""")
        assert Grammar(grammar_def) == Grammar(grammar_def)

        self.assertEqual(Grammar(rule_syntax), Grammar(rule_syntax))

        self.assertNotEqual(Grammar('expr = ~"[a-z]{1,3}"'), Grammar('expr = ~"[a-z]{2,3}"'))
        self.assertNotEqual(Grammar('expr = ~"[a-z]{1,3}"'), Grammar('expr = ~"[a-z]{1,4}"'))
        self.assertNotEqual(Grammar('expr = &"a"'), Grammar('expr = !"a"'))
class TokenGrammarTests(TestCase):
    """Tests for the TokenGrammar class and associated machinery"""

    def test_parse_success(self):
        """Token literals should work."""
        tokens = [Token('token1'), Token('token2')]
        grammar = TokenGrammar("""
foo = token1 "token2"
token1 = "token1"
""")
        expected = Node(grammar['foo'], tokens, 0, 2, children=[
            Node(grammar['token1'], tokens, 0, 1),
            Node(TokenMatcher('token2'), tokens, 1, 2)])
        self.assertEqual(grammar.parse(tokens), expected)

    def test_parse_failure(self):
        """Parse failures should work normally with token literals."""
        grammar = TokenGrammar("""
foo = "token1" "token2"
""")
        bad_stream = [Token('tokenBOO'), Token('token2')]
        with pytest.raises(ParseError) as excinfo:
            grammar.parse(bad_stream)
        assert "Rule 'foo' didn't match at" in str(excinfo.value)

    def test_token_repr(self):
        """A Token's repr is a ``str`` that shows a non-ASCII type verbatim."""
        bomb = Token('💣')
        self.assertIsInstance(repr(bomb), str)
        self.assertEqual('<Token "💣">', repr(bomb))

    def test_token_star_plus_expressions(self):
        """``*`` and ``+`` over a token literal accept runs of that token and
        reject streams containing a different one."""
        a = Token("a")
        b = Token("b")
        grammar = TokenGrammar("""
foo = "a"*
bar = "a"+
""")
        # Zero or more "a" tokens.
        for stream in ([], [a], [a, a]):
            assert grammar["foo"].parse(stream) is not None
        for stream in ([a, b], [b]):
            with pytest.raises(ParseError):
                grammar["foo"].parse(stream)

        # One or more "a" tokens.
        assert grammar["bar"].parse([a]) is not None
        for stream in ([a, b], [b]):
            with pytest.raises(ParseError):
                grammar["bar"].parse(stream)
def test_precedence_of_string_modifiers():
    """A prefixed string such as ``r"\\b"`` becomes one ``Literal``, even when
    a rule named ``r`` also exists."""
    # r"strings", etc. should be parsed as a single literal, not r followed
    # by a string literal.
    two_rule_grammar = Grammar(r"""
escaped_bell = r"\b"
r = "irrelevant"
""")
    escaped_bell = two_rule_grammar["escaped_bell"]
    assert isinstance(escaped_bell, Literal)
    assert escaped_bell.literal == "\\b"
    with pytest.raises(ParseError):
        two_rule_grammar.parse("irrelevant\b")

    single_rule_grammar = Grammar(r"""
escaped_bell = r"\b"
""")
    assert single_rule_grammar.parse("\\b")
def test_binary_grammar():
    """A grammar written with bytes literals and bytes regexes parses a
    ``bytes`` input."""
    g = Grammar(r"""
file = header body terminator
header = b"\xFF" length b"~"
length = ~rb"\d+"
body = ~b"[^\xFF]*"
terminator = b"\xFF"
""")
    length = 22
    # Build the input from ``length`` so the declared length and the body
    # size cannot drift apart; this yields b"\xff22~" + 22 * b"a" + b"\xff".
    payload = (b"\xff" + str(length).encode("ascii") + b"~"
               + (b"a" * length) + b"\xff")
    assert g.parse(payload) is not None
def test_inconsistent_string_types_in_grammar():
    """Mixing bytes and str literals in one grammar raises a
    ``VisitationError`` wrapping ``BadGrammar``; uniform grammars build."""
    mixed_type_grammars = (
        r"""
foo = b"foo"
bar = "bar"
""",
        r"""
foo = ~b"foo"
bar = "bar"
""",
    )
    for rules in mixed_type_grammars:
        with pytest.raises(VisitationError) as excinfo:
            Grammar(rules)
        assert excinfo.value.original_class is BadGrammar

    # The following should parse without errors because they use the same
    # string types:
    uniform_type_grammars = (
        r"""
foo = b"foo"
bar = b"bar"
""",
        r"""
foo = "foo"
bar = "bar"
""",
    )
    for rules in uniform_type_grammars:
        Grammar(rules)
def test_left_associative():
    """Parsing through a left-recursive rule raises ``LeftRecursionError``
    with an explanatory message."""
    # Regression test for https://github.com/erikrose/parsimonious/issues/209
    rules = r"""
expression = operator_expression / non_operator_expression
non_operator_expression = number_expression
operator_expression = expression "+" non_operator_expression
number_expression = ~"[0-9]+"
"""
    left_recursive_rule = Grammar(rules)["operator_expression"]
    with pytest.raises(LeftRecursionError) as excinfo:
        left_recursive_rule.parse("1+2")
    message = str(excinfo.value)
    assert "Parsimonious is a packrat parser, so it can't handle left recursion." in message