# coding=utf-8 from sys import version_info from unittest import TestCase import pytest from parsimonious.exceptions import BadGrammar, LeftRecursionError, ParseError, UndefinedLabel, VisitationError from parsimonious.expressions import Literal, Lookahead, Regex, Sequence, TokenMatcher, is_callable from parsimonious.grammar import rule_grammar, rule_syntax, RuleVisitor, Grammar, TokenGrammar, LazyReference from parsimonious.nodes import Node from parsimonious.utils import Token class BootstrappingGrammarTests(TestCase): """Tests for the expressions in the grammar that parses the grammar definition syntax""" def test_quantifier(self): text = '*' quantifier = rule_grammar['quantifier'] self.assertEqual(quantifier.parse(text), Node(quantifier, text, 0, 1, children=[ Node(quantifier.members[0], text, 0, 1), Node(rule_grammar['_'], text, 1, 1)])) text = '?' self.assertEqual(quantifier.parse(text), Node(quantifier, text, 0, 1, children=[ Node(quantifier.members[0], text, 0, 1), Node(rule_grammar['_'], text, 1, 1)])) text = '+' self.assertEqual(quantifier.parse(text), Node(quantifier, text, 0, 1, children=[ Node(quantifier.members[0], text, 0, 1), Node(rule_grammar['_'], text, 1, 1)])) def test_spaceless_literal(self): text = '"anything but quotes#$*&^"' spaceless_literal = rule_grammar['spaceless_literal'] self.assertEqual(spaceless_literal.parse(text), Node(spaceless_literal, text, 0, len(text), children=[ Node(spaceless_literal.members[0], text, 0, len(text))])) text = r'''r"\""''' self.assertEqual(spaceless_literal.parse(text), Node(spaceless_literal, text, 0, 5, children=[ Node(spaceless_literal.members[0], text, 0, 5)])) def test_regex(self): text = '~"[a-zA-Z_][a-zA-Z_0-9]*"LI' regex = rule_grammar['regex'] self.assertEqual(rule_grammar['regex'].parse(text), Node(regex, text, 0, len(text), children=[ Node(Literal('~'), text, 0, 1), Node(rule_grammar['spaceless_literal'], text, 1, 25, children=[ Node(rule_grammar['spaceless_literal'].members[0], text, 1, 25)]), Node(regex.members[2], text, 25, 27), Node(rule_grammar['_'], text, 27, 27)])) def test_successes(self): """Make sure the PEG recognition grammar succeeds on various inputs.""" self.assertTrue(rule_grammar['label'].parse('_')) self.assertTrue(rule_grammar['label'].parse('jeff')) self.assertTrue(rule_grammar['label'].parse('_THIS_THING')) self.assertTrue(rule_grammar['atom'].parse('some_label')) self.assertTrue(rule_grammar['atom'].parse('"some literal"')) self.assertTrue(rule_grammar['atom'].parse('~"some regex"i')) self.assertTrue(rule_grammar['quantified'].parse('~"some regex"i*')) self.assertTrue(rule_grammar['quantified'].parse('thing+')) self.assertTrue(rule_grammar['quantified'].parse('"hi"?')) self.assertTrue(rule_grammar['term'].parse('this')) self.assertTrue(rule_grammar['term'].parse('that+')) self.assertTrue(rule_grammar['sequence'].parse('this that? other')) self.assertTrue(rule_grammar['ored'].parse('this / that+ / "other"')) # + is higher precedence than &, so 'anded' should match the whole # thing: self.assertTrue(rule_grammar['lookahead_term'].parse('&this+')) self.assertTrue(rule_grammar['expression'].parse('this')) self.assertTrue(rule_grammar['expression'].parse('this? that other*')) self.assertTrue(rule_grammar['expression'].parse('&this / that+ / "other"')) self.assertTrue(rule_grammar['expression'].parse('this / that? / "other"+')) self.assertTrue(rule_grammar['expression'].parse('this? that other*')) self.assertTrue(rule_grammar['rule'].parse('this = that\r')) self.assertTrue(rule_grammar['rule'].parse('this = the? that other* \t\r')) self.assertTrue(rule_grammar['rule'].parse('the=~"hi*"\n')) self.assertTrue(rule_grammar.parse(''' this = the? that other* that = "thing" the=~"hi*" other = "ahoy hoy" ''')) class RuleVisitorTests(TestCase): """Tests for ``RuleVisitor`` As I write these, Grammar is not yet fully implemented. Normally, there'd be no reason to use ``RuleVisitor`` directly. """ def test_round_trip(self): """Test a simple round trip. Parse a simple grammar, turn the parse tree into a map of expressions, and use that to parse another piece of text. Not everything was implemented yet, but it was a big milestone and a proof of concept. """ tree = rule_grammar.parse('''number = ~"[0-9]+"\n''') rules, default_rule = RuleVisitor().visit(tree) text = '98' self.assertEqual(default_rule.parse(text), Node(default_rule, text, 0, 2)) def test_undefined_rule(self): """Make sure we throw the right exception on undefined rules.""" tree = rule_grammar.parse('boy = howdy\n') self.assertRaises(UndefinedLabel, RuleVisitor().visit, tree) def test_optional(self): tree = rule_grammar.parse('boy = "howdy"?\n') rules, default_rule = RuleVisitor().visit(tree) howdy = 'howdy' # It should turn into a Node from the Optional and another from the # Literal within. self.assertEqual(default_rule.parse(howdy), Node(default_rule, howdy, 0, 5, children=[ Node(Literal("howdy"), howdy, 0, 5)])) def function_rule(text, pos): """This is an example of a grammar rule implemented as a function, and is provided as a test fixture.""" token = 'function' return pos + len(token) if text[pos:].startswith(token) else None class GrammarTests(TestCase): """Integration-test ``Grammar``: feed it a PEG and see if it works.""" def method_rule(self, text, pos): """This is an example of a grammar rule implemented as a method, and is provided as a test fixture.""" token = 'method' return pos + len(token) if text[pos:].startswith(token) else None @staticmethod def descriptor_rule(text, pos): """This is an example of a grammar rule implemented as a descriptor, and is provided as a test fixture.""" token = 'descriptor' return pos + len(token) if text[pos:].startswith(token) else None rules = {"descriptor_rule": descriptor_rule} def test_expressions_from_rules(self): """Test the ``Grammar`` base class's ability to compile an expression tree from rules. That the correct ``Expression`` tree is built is already tested in ``RuleGrammarTests``. This tests only that the ``Grammar`` base class's ``_expressions_from_rules`` works. """ greeting_grammar = Grammar('greeting = "hi" / "howdy"') tree = greeting_grammar.parse('hi') self.assertEqual(tree, Node(greeting_grammar['greeting'], 'hi', 0, 2, children=[ Node(Literal('hi'), 'hi', 0, 2)])) def test_unicode(self): """Assert that a ``Grammar`` can convert into a string-formatted series of rules.""" grammar = Grammar(r""" bold_text = bold_open text bold_close text = ~"[A-Z 0-9]*"i bold_open = "((" bold_close = "))" """) lines = str(grammar).splitlines() self.assertEqual(lines[0], 'bold_text = bold_open text bold_close') self.assertTrue("text = ~'[A-Z 0-9]*'i%s" % ('u' if version_info >= (3,) else '') in lines) self.assertTrue("bold_open = '(('" in lines) self.assertTrue("bold_close = '))'" in lines) self.assertEqual(len(lines), 4) def test_match(self): """Make sure partial-matching (with pos) works.""" grammar = Grammar(r""" bold_text = bold_open text bold_close text = ~"[A-Z 0-9]*"i bold_open = "((" bold_close = "))" """) s = ' ((boo))yah' self.assertEqual(grammar.match(s, pos=1), Node(grammar['bold_text'], s, 1, 8, children=[ Node(grammar['bold_open'], s, 1, 3), Node(grammar['text'], s, 3, 6), Node(grammar['bold_close'], s, 6, 8)])) def test_bad_grammar(self): """Constructing a Grammar with bad rules should raise ParseError.""" self.assertRaises(ParseError, Grammar, 'just a bunch of junk') def test_comments(self): """Test tolerance of comments and blank lines in and around rules.""" grammar = Grammar(r"""# This is a grammar. # It sure is. bold_text = stars text stars # nice text = ~"[A-Z 0-9]*"i #dude stars = "**" # Pretty good #Oh yeah.#""") # Make sure a comment doesn't need a # \n or \r to end. self.assertEqual(list(sorted(str(grammar).splitlines())), ['''bold_text = stars text stars''', # TODO: Unicode flag is on by default in Python 3. I wonder if we # should turn it on all the time in Parsimonious. """stars = '**'""", '''text = ~'[A-Z 0-9]*'i%s''' % ('u' if version_info >= (3,) else '')]) def test_multi_line(self): """Make sure we tolerate all sorts of crazy line breaks and comments in the middle of rules.""" grammar = Grammar(""" bold_text = bold_open # commenty comment text # more comment bold_close text = ~"[A-Z 0-9]*"i bold_open = "((" bold_close = "))" """) self.assertTrue(grammar.parse('((booyah))') is not None) def test_not(self): """Make sure "not" predicates get parsed and work properly.""" grammar = Grammar(r'''not_arp = !"arp" ~"[a-z]+"''') self.assertRaises(ParseError, grammar.parse, 'arp') self.assertTrue(grammar.parse('argle') is not None) def test_lookahead(self): grammar = Grammar(r'''starts_with_a = &"a" ~"[a-z]+"''') self.assertRaises(ParseError, grammar.parse, 'burp') s = 'arp' self.assertEqual(grammar.parse('arp'), Node(grammar['starts_with_a'], s, 0, 3, children=[ Node(Lookahead(Literal('a')), s, 0, 0), Node(Regex(r'[a-z]+'), s, 0, 3)])) def test_parens(self): grammar = Grammar(r'''sequence = "chitty" (" " "bang")+''') # Make sure it's not as if the parens aren't there: self.assertRaises(ParseError, grammar.parse, 'chitty bangbang') s = 'chitty bang bang' self.assertEqual(str(grammar.parse(s)), """ """) def test_resolve_refs_order(self): """Smoke-test a circumstance where lazy references don't get resolved.""" grammar = Grammar(""" expression = "(" terms ")" terms = term+ term = number number = ~r"[0-9]+" """) grammar.parse('(34)') def test_resolve_refs_completeness(self): """Smoke-test another circumstance where lazy references don't get resolved.""" grammar = Grammar(r""" block = "{" _ item* "}" _ # An item is an element of a block. item = number / word / block / paren # Parens are for delimiting subexpressions. paren = "(" _ item* ")" _ # Words are barewords, unquoted things, other than literals, that can live # in lists. We may renege on some of these chars later, especially ".". We # may add Unicode. word = spaceless_word _ spaceless_word = ~r"[-a-z`~!@#$%^&*_+=|\\;<>,.?][-a-z0-9`~!@#$%^&*_+=|\\;<>,.?]*"i number = ~r"[0-9]+" _ # There are decimals and strings and other stuff back on the "parsing" branch, once you get this working. _ = meaninglessness* meaninglessness = whitespace whitespace = ~r"\s+" """) grammar.parse('{log (add 3 to 5)}') def test_infinite_loop(self): """Smoke-test a grammar that was causing infinite loops while building. This was going awry because the "int" rule was never getting marked as resolved, so it would just keep trying to resolve it over and over. """ Grammar(""" digits = digit+ int = digits digit = ~"[0-9]" number = int main = number """) def test_circular_toplevel_reference(self): with pytest.raises(VisitationError): Grammar(""" foo = bar bar = foo """) with pytest.raises(VisitationError): Grammar(""" foo = foo bar = foo """) with pytest.raises(VisitationError): Grammar(""" foo = bar bar = baz baz = foo """) def test_right_recursive(self): """Right-recursive refs should resolve.""" grammar = Grammar(""" digits = digit digits? digit = ~r"[0-9]" """) self.assertTrue(grammar.parse('12') is not None) def test_badly_circular(self): """Uselessly circular references should be detected by the grammar compiler.""" self.skipTest('We have yet to make the grammar compiler detect these.') Grammar(""" foo = bar bar = foo """) def test_parens_with_leading_whitespace(self): """Make sure a parenthesized expression is allowed to have leading whitespace when nested directly inside another.""" Grammar("""foo = ( ("c") )""").parse('c') def test_single_quoted_literals(self): Grammar("""foo = 'a' '"'""").parse('a"') def test_simple_custom_rules(self): """Run 2-arg custom-coded rules through their paces.""" grammar = Grammar(""" bracketed_digit = start digit end start = '[' end = ']'""", digit=lambda text, pos: (pos + 1) if text[pos].isdigit() else None) s = '[6]' self.assertEqual(grammar.parse(s), Node(grammar['bracketed_digit'], s, 0, 3, children=[ Node(grammar['start'], s, 0, 1), Node(grammar['digit'], s, 1, 2), Node(grammar['end'], s, 2, 3)])) def test_complex_custom_rules(self): """Run 5-arg custom rules through their paces. Incidentally tests returning an actual Node from the custom rule. """ grammar = Grammar(""" bracketed_digit = start digit end start = '[' end = ']' real_digit = '6'""", # In this particular implementation of the digit rule, no node is # generated for `digit`; it falls right through to `real_digit`. # I'm not sure if this could lead to problems; I can't think of # any, but it's probably not a great idea. digit=lambda text, pos, cache, error, grammar: grammar['real_digit'].match_core(text, pos, cache, error)) s = '[6]' self.assertEqual(grammar.parse(s), Node(grammar['bracketed_digit'], s, 0, 3, children=[ Node(grammar['start'], s, 0, 1), Node(grammar['real_digit'], s, 1, 2), Node(grammar['end'], s, 2, 3)])) def test_lazy_custom_rules(self): """Make sure LazyReferences manually shoved into custom rules are resolved. Incidentally test passing full-on Expressions as custom rules and having a custom rule as the default one. """ grammar = Grammar(""" four = '4' five = '5'""", forty_five=Sequence(LazyReference('four'), LazyReference('five'), name='forty_five')).default('forty_five') s = '45' self.assertEqual(grammar.parse(s), Node(grammar['forty_five'], s, 0, 2, children=[ Node(grammar['four'], s, 0, 1), Node(grammar['five'], s, 1, 2)])) def test_unconnected_custom_rules(self): """Make sure custom rules that aren't hooked to any other rules still get included in the grammar and that lone ones get set as the default. Incidentally test Grammar's `rules` default arg. """ grammar = Grammar(one_char=lambda text, pos: pos + 1).default('one_char') s = '4' self.assertEqual(grammar.parse(s), Node(grammar['one_char'], s, 0, 1)) def test_callability_of_routines(self): self.assertTrue(is_callable(function_rule)) self.assertTrue(is_callable(self.method_rule)) self.assertTrue(is_callable(self.rules['descriptor_rule'])) def test_callability_custom_rules(self): """Confirms that functions, methods and method descriptors can all be used to supply custom grammar rules. """ grammar = Grammar(""" default = function method descriptor """, function=function_rule, method=self.method_rule, descriptor=self.rules['descriptor_rule'], ) result = grammar.parse('functionmethoddescriptor') rule_names = [node.expr.name for node in result.children] self.assertEqual(rule_names, ['function', 'method', 'descriptor']) def test_lazy_default_rule(self): """Make sure we get an actual rule set as our default rule, even when the first rule has forward references and is thus a LazyReference at some point during grammar compilation. """ grammar = Grammar(r""" styled_text = text text = "hi" """) self.assertEqual(grammar.parse('hi'), Node(grammar['text'], 'hi', 0, 2)) def test_immutable_grammar(self): """Make sure that a Grammar is immutable after being created.""" grammar = Grammar(r""" foo = 'bar' """) def mod_grammar(grammar): grammar['foo'] = 1 self.assertRaises(TypeError, mod_grammar, [grammar]) def mod_grammar(grammar): new_grammar = Grammar(r""" baz = 'biff' """) grammar.update(new_grammar) self.assertRaises(AttributeError, mod_grammar, [grammar]) def test_repr(self): self.assertTrue(repr(Grammar(r'foo = "a"'))) def test_rule_ordering_is_preserved(self): grammar = Grammar('\n'.join('r%s = "something"' % i for i in range(100))) self.assertEqual( list(grammar.keys()), ['r%s' % i for i in range(100)]) def test_rule_ordering_is_preserved_on_shallow_copies(self): grammar = Grammar('\n'.join('r%s = "something"' % i for i in range(100)))._copy() self.assertEqual( list(grammar.keys()), ['r%s' % i for i in range(100)]) def test_repetitions(self): grammar = Grammar(r''' left_missing = "a"{,5} right_missing = "a"{5,} exact = "a"{5} range = "a"{2,5} optional = "a"? plus = "a"+ star = "a"* ''') should_parse = [ ("left_missing", ["a" * i for i in range(6)]), ("right_missing", ["a" * i for i in range(5, 8)]), ("exact", ["a" * 5]), ("range", ["a" * i for i in range(2, 6)]), ("optional", ["", "a"]), ("plus", ["a", "aa"]), ("star", ["", "a", "aa"]), ] for rule, examples in should_parse: for example in examples: assert grammar[rule].parse(example) should_not_parse = [ ("left_missing", ["a" * 6]), ("right_missing", ["a" * i for i in range(5)]), ("exact", ["a" * i for i in list(range(5)) + list(range(6, 10))]), ("range", ["a" * i for i in list(range(2)) + list(range(6, 10))]), ("optional", ["aa"]), ("plus", [""]), ("star", ["b"]), ] for rule, examples in should_not_parse: for example in examples: with pytest.raises(ParseError): grammar[rule].parse(example) def test_equal(self): grammar_def = (r""" x = y / z / "" y = "y" x z = "z" x """) assert Grammar(grammar_def) == Grammar(grammar_def) self.assertEqual(Grammar(rule_syntax), Grammar(rule_syntax)) self.assertNotEqual(Grammar('expr = ~"[a-z]{1,3}"'), Grammar('expr = ~"[a-z]{2,3}"')) self.assertNotEqual(Grammar('expr = ~"[a-z]{1,3}"'), Grammar('expr = ~"[a-z]{1,4}"')) self.assertNotEqual(Grammar('expr = &"a"'), Grammar('expr = !"a"')) class TokenGrammarTests(TestCase): """Tests for the TokenGrammar class and associated machinery""" def test_parse_success(self): """Token literals should work.""" s = [Token('token1'), Token('token2')] grammar = TokenGrammar(""" foo = token1 "token2" token1 = "token1" """) self.assertEqual(grammar.parse(s), Node(grammar['foo'], s, 0, 2, children=[ Node(grammar['token1'], s, 0, 1), Node(TokenMatcher('token2'), s, 1, 2)])) def test_parse_failure(self): """Parse failures should work normally with token literals.""" grammar = TokenGrammar(""" foo = "token1" "token2" """) with pytest.raises(ParseError) as e: grammar.parse([Token('tokenBOO'), Token('token2')]) assert "Rule 'foo' didn't match at" in str(e.value) def test_token_repr(self): t = Token('💣') self.assertTrue(isinstance(t.__repr__(), str)) self.assertEqual('', t.__repr__()) def test_token_star_plus_expressions(self): a = Token("a") b = Token("b") grammar = TokenGrammar(""" foo = "a"* bar = "a"+ """) assert grammar["foo"].parse([]) is not None assert grammar["foo"].parse([a]) is not None assert grammar["foo"].parse([a, a]) is not None with pytest.raises(ParseError): grammar["foo"].parse([a, b]) with pytest.raises(ParseError): grammar["foo"].parse([b]) assert grammar["bar"].parse([a]) is not None with pytest.raises(ParseError): grammar["bar"].parse([a, b]) with pytest.raises(ParseError): grammar["bar"].parse([b]) def test_precedence_of_string_modifiers(): # r"strings", etc. should be parsed as a single literal, not r followed # by a string literal. g = Grammar(r""" escaped_bell = r"\b" r = "irrelevant" """) assert isinstance(g["escaped_bell"], Literal) assert g["escaped_bell"].literal == "\\b" with pytest.raises(ParseError): g.parse("irrelevant\b") g2 = Grammar(r""" escaped_bell = r"\b" """) assert g2.parse("\\b") def test_binary_grammar(): g = Grammar(r""" file = header body terminator header = b"\xFF" length b"~" length = ~rb"\d+" body = ~b"[^\xFF]*" terminator = b"\xFF" """) length = 22 assert g.parse(b"\xff22~" + (b"a" * 22) + b"\xff") is not None def test_inconsistent_string_types_in_grammar(): with pytest.raises(VisitationError) as e: Grammar(r""" foo = b"foo" bar = "bar" """) assert e.value.original_class is BadGrammar with pytest.raises(VisitationError) as e: Grammar(r""" foo = ~b"foo" bar = "bar" """) assert e.value.original_class is BadGrammar # The following should parse without errors because they use the same # string types: Grammar(r""" foo = b"foo" bar = b"bar" """) Grammar(r""" foo = "foo" bar = "bar" """) def test_left_associative(): # Regression test for https://github.com/erikrose/parsimonious/issues/209 language_grammar = r""" expression = operator_expression / non_operator_expression non_operator_expression = number_expression operator_expression = expression "+" non_operator_expression number_expression = ~"[0-9]+" """ grammar = Grammar(language_grammar) with pytest.raises(LeftRecursionError) as e: grammar["operator_expression"].parse("1+2") assert "Parsimonious is a packrat parser, so it can't handle left recursion." in str(e.value)