SLY python can't parse simple tokens-CodePudding

I'm working on making a simple interpreted programming language using SLY to generate a AST which I will interpret without using SLY.

Currently I have been able to generate all my tokens and giving them to the parser, but it can't recognize any rule, only empty ones.

Lexer:

from .sly.lex import Lexer

class ALexer(Lexer):
    tokens = { ID, NUMBER, STRING, BOOL, PLUS, TIMES, MINUS, DIVIDE, ASSIGN, LPAREN, RPAREN, COMMA, NL }
    ignore = ' \t'

# Tokens
@_(r'\d [[.]?\d*[f]?]?')
def NUMBER(self, t):
    endswithF = t.value.endswith('f')
    isfloat = endswithF or t.value.find('.') != -1
    t.value = float(t.value[:-1] if endswithF else t.value) if isfloat else int(t.value)   # Convert to a numeric value
    return t

ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
ID['true'] = BOOL
ID['false'] = BOOL

@_(r'".*"')
def STRING(self, t):
    t.value = t.value[1:-1]
    return t

# Special symbols
PLUS = r'\ '
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
ASSIGN = r'='
LPAREN = r'\('
RPAREN = r'\)'
COMMA = r','

@_(r'true', r'false')
def BOOL(self, t):
    t.value = t.value == 'true'
    return t

@_(r'\n')
def NL(self, t):
    self.lineno  = 1

def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index  = 1

Util classes:

from enum import Enum

class SupportedOp(str, Enum):
    CONSTANT = "CONSTANT",
    VARIABLE = "VARIABLE",
    ARGS_LIST = "ARGS_LIST",
    FUNC_CALL = "FUNC_CALL",
    STATEMENT = "STATEMENT",
    STATEMENT_LIST = "STATEMENT_LIST",
    PROGRAM = "PROGRAM",
    SUM = ' ',
    SUB = '-',
    DIV = '/',
    MUL = '*',
    ASSIGNMENT = '='

class ParsedOp(dict):
    def __init__(self, op: SupportedOp, *values):
        dict.__init__(self, op=op, values=values)

Parser:

class AParser(Parser):
debugfile = 'parser.out'
tokens = ALexer.tokens

precedence = (
      #('nonassoc', LESSTHAN, GREATERTHAN),  # Nonassociative operators
      ('left', PLUS, MINUS),
      ('left', TIMES, DIVIDE)
      #('right', UMINUS),            # Unary minus operator
 )

@_('statement_list')
def program(self, p):
    print('program', p[0])
    return ParsedOp(SupportedOp.PROGRAM, p[0])

@_('statement BACK_IN_LINES statement_list')
def statement_list(self, p):
    print('statement_list', p[0], p[1], p[2])
    lst: list = p[1].values[0]
    lst.append(p[0])
    return ParsedOp(SupportedOp.STATEMENT_LIST, lst)

@_('statement')
def statement_list(self, p):
    print('statement_list', p[0])
    return ParsedOp(SupportedOp.STATEMENT_LIST, [p[0]])

@_('empty')
def statement_list(self, p):
    print('empty statement_list')
    return ParsedOp(SupportedOp.STATEMENT_LIST, [])

@_('NL BACK_IN_LINES', 'NL')
def BACK_IN_LINES(self, p):
    print('BACK_IN_LINES', p[0])
    #unused
    return 'NL'

@_('assignment', 'expr')
def statement(self, p):
    print('statement', p[0])
    return ParsedOp(SupportedOp.STATEMENT, p[0])

@_('ID ASSIGN expr')
def assignment(self, p):
    print('assignment', p[0], p[1], p[2])
    return ParsedOp(SupportedOp.ASSIGNMENT, p[0], p[2])

@_('expr COMMA expr_list')
def expr_list(self, p):
    print('expr_list', p[0], p[1], p[2])
    lst: list = p[1].values[0]
    lst.append(p[0])
    return ParsedOp(SupportedOp.ARGS_LIST, lst)

@_('expr')
def expr_list(self, p):
    print('expr_list', p[0])
    return ParsedOp(SupportedOp.ARGS_LIST, [p[0]])

@_('empty')
def expr_list(self, p):
    print('empty expr_list')
    return ParsedOp(SupportedOp.ARGS_LIST, [])

@_('constant')
def expr(self, p):
    print('expr', p[0])
    return p[0]

@_('ID')
def expr(self, p):
    print('expr', p[0])
    return ParsedOp(SupportedOp.VARIABLE, p[0])

@_('LPAREN expr RPAREN')
def expr(self, p):
    print('expr', p[0], p[1], p[2])
    return p[1]

@_('ID LPAREN expr_list RPAREN')
def expr(self, p):
    print('expr', p[0], p[1], p[2], p[3])
    #if exists p.ID in functions
    return ParsedOp(SupportedOp.FUNC_CALL, p.ID, p.expr_list)

@_( 'expr PLUS expr',
    'expr MINUS expr',
    'expr TIMES expr',
    'expr DIVIDE expr')
def expr(self, p):
    print('expr', p[0], p[1], p[2])
    return ParsedOp(SupportedOp(p[1]), p[0], p[2])

@_('NUMBER', 'STRING', 'BOOL')
def constant(self, p):
    print('constant', p[0])
    return ParsedOp(SupportedOp.CONSTANT, p[0])

@_('')
def empty(self, p):
    print('empty')
    pass

def error(self, p):
    if p:
        print("Syntax error at token", p.type)
        # Just discard the token and tell the parser it's okay.
        self.errok()
    else:
        print("Syntax error at EOF")

I don't know if all my rules are ok but I commented every rule leaving uncommented only the "constant" rule which is straightforward, still can't recognize it.

Main:

tokens_out = lexer.ALexer().tokenize(event.get("code"))
tree = AParser().parse(tokens_out)

All my tokens are well recognized so the Lexer is ok. The parser can't recognize any rule. Any idea?

CodePudding user response：

As far as I can see, your code works fine up to the point at which you attempt to parse the second statement. I tried it with the input x=2 as suggested in your comment, and it produced the following result (pretty-printed with the pprint module):

{ 'op': <SupportedOp.PROGRAM: 'PROGRAM'>,
  'values': ( { 'op': <SupportedOp.STATEMENT_LIST: 'STATEMENT_LIST'>,
                'values': ( [ { 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
                                'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
                                              'values': ( 'x',
                                                          { 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
                                                            'values': ( 2,)})},)}],)},)}

But once you try to parse two statements --say x=2 and y=2 on separate lines-- things start to fall apart.

There are several issues, which I'll try to deal with one at a time.

The first problem is that your NL lexer rule does not return anything, which means that no token is emitted. That means that the BACK_IN_LINES rule cannot match, because it is expecting to see one or more NL tokens. This generates a cascade of parser errors, which you should have seen on your console.

It's easy to get confused about the parser's progress. Since the parser doesn't see the NL token, what it sees is x = 2 y .... At the moment at which it sees the y, it hasn't yet reduced expr; after all, the next token might have been a or other operator. But an ID is not one of the possible tokens which can follow 2, so a syntax error is thrown immediately, with the result that expr, statement and statement_list are not reduced, and so their reduction actions never execute, and thus you never see any of the debugging output you've put in the reduction actions. That doesn't really mean that the rule was not recognised; it would be more accurate to say that the rule was recognised but a longer match was still possible.

If you fix the lexer by adding return t at the end of the NL rule, then you run into the second problem, which is the reduction action for statement_list: statement BACK_IN_LINES statement_list. This bug causes the parse to fail even for an input consisting only of x=2, if that input is terminated with a newline character. The action is triggered because BACK_IN_LINES is now recognised, so the grammar requires that the statement_list has two subcomponents, a statement (x=2) and a statement_listconsisting of the rest of the input. The rest of the input is empty, which is OK because you allow astatement_list` to match an empty input. But, as I said, that causes the reduction action to execute, and that reduction action includes:

lst: list = p[1].values[0]

That line has several problems. First, p[1] is the BACK_IN_LINES grammar symbol; you obviously intended to use p[2], which is the statement_list. So p[1] is the string NL, which doesn't have a values attribute. (That should be evident from the Python traceback, assuming that you can see the exception tracebacks. If you can't, you should seriously consider debugging in a Python console instead of whatever you are using.)

But when we fix it to use p[2] instead, we still get a Python TypeError exception. Instead of complaining that a str has no values attribute, it now complains that a builtin_function_or_method is not subscriptable. The builtin_function_or_method is the values method of a dict object, becasue p[2] is a SupportedOp, which is subclassed from dict. If you had intended to use the values method, you would have had to have called it, but I don't think that was what you intended (and therefore the values key is perhaps badly named). What you actually meant was the value associated with the values key, which means that the line should read:

lst: list = p[2]["values"][0]

But that's still a bit problematic. Let's take a look at the parsed result of the input consisting of three lines:

first = 1
second = 2
third = 3

That produces (again, pretty printed with pprint):

{ 'op': <SupportedOp.PROGRAM: 'PROGRAM'>,
  'values': ( { 'op': <SupportedOp.STATEMENT_LIST: 'STATEMENT_LIST'>,
                'values': ( [ { 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
                                'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
                                              'values': ( 'third',
                                                          { 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
                                                            'values': ( 3,)})},)},
                              { 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
                                'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
                                              'values': ( 'second',
                                                          { 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
                                                            'values': ( 2,)})},)},
                              { 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
                                'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
                                              'values': ( 'first',
                                                          { 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
                                                            'values': ( 1,)})},)}],)},)}

If you look carefully at that output, you'll see that the three statements are indeed there, but in the wrong order.

That's the result of your choosing to use a right-recursive rule for statement_list:

statement_list: statement BACK_IN_LINES statement_list

Right-recursive rules are practically never correct for bottom-up parsers (such as the LALR(1) parser generated by Sly), although there are times when they are needed. If you can choose between a left-recursive rule and a right-recursive rule, your best option is the left-recursive rule:

statement_list: statement_list BACK_IN_LINES statement

This has two advantages. First, it doesn't require use of the parser stack to hold all the intermediate and incomplete subparses of statement_list until the end of the list is reached. More importantly, it executes the reduction actions for the recursive non-terminal in the order you probably expect them to be executed, which is left-to-right.

The right-recursive rule executes the reduction actions right-to-left, because the first recursive reduction is the last nested statement_list. And the consequence is that which we have already seen: the statements are appended to the statement_list in reverse order, starting with the last one.

It's easy to rewrite the rule as left-recursive. But if we do a naive change, we'll end up with a different problem. It's (probably) desirable that the input is allowed to but not required to end with a newline. That worked with the right-recursive production because statement_list was allowed to be empty. But that won't help with the left-recursive rule; a left-recursive rule with an empty option puts the empty option at the beginning of the list (precisely because it produces the list left-to-right). With the left-recursive rule, it's more convenient to allow statement to be empty. But instead of adding an "empty statement", which will clutter up the AST, we can just add a rule to statement_list which accepts an empty sequence instead of a statement. Once we do that, we eliminate the need for BACK_IN_LINES, because the grammatical logic is kept in statement_list.

So we can remove BACK_IN_LINES and replace the rules for statement_list with the following:

    @_('statement')
    def statement_list(self, p):
        print('statement_list', p[0])
        return ParsedOp(SupportedOp.STATEMENT_LIST, [p[0]])
    
    @_('empty')
    def statement_list(self, p):
        print('empty statement_list')
        return ParsedOp(SupportedOp.STATEMENT_LIST, [])
    
    @_('statement_list NL')
    def statement_list(self, p):
        print('statement_list', p[0])
        return p[0]
    
    @_('statement_list NL statement')
    def statement_list(self, p):
        print('statement_list', p[0], p[1], p[2])
        lst: list = p[0]["values"][0]
        lst.append(p[2])
        return ParsedOp(SupportedOp.STATEMENT_LIST, lst)

Now, finally, we get the expected parse:

{ 'op': <SupportedOp.PROGRAM: 'PROGRAM'>,
  'values': ( { 'op': <SupportedOp.STATEMENT_LIST: 'STATEMENT_LIST'>,
                'values': ( [ { 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
                                'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
                                              'values': ( 'first',
                                                          { 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
                                                            'values': ( 1,)})},)},
                              { 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
                                'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
                                              'values': ( 'second',
                                                          { 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
                                                            'values': ( 2,)})},)},
                              { 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
                                'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
                                              'values': ( 'third',
                                                          { 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
                                                            'values': ( 3,)})},)}],)},)}