I'm working on making a simple interpreted programming language using SLY to generate a AST which I will interpret without using SLY.
Currently I have been able to generate all my tokens and giving them to the parser, but it can't recognize any rule, only empty ones.
Lexer:
from .sly.lex import Lexer
class ALexer(Lexer):
tokens = { ID, NUMBER, STRING, BOOL, PLUS, TIMES, MINUS, DIVIDE, ASSIGN, LPAREN, RPAREN, COMMA, NL }
ignore = ' \t'
# Tokens
@_(r'\d [[.]?\d*[f]?]?')
def NUMBER(self, t):
endswithF = t.value.endswith('f')
isfloat = endswithF or t.value.find('.') != -1
t.value = float(t.value[:-1] if endswithF else t.value) if isfloat else int(t.value) # Convert to a numeric value
return t
ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
ID['true'] = BOOL
ID['false'] = BOOL
@_(r'".*"')
def STRING(self, t):
t.value = t.value[1:-1]
return t
# Special symbols
PLUS = r'\ '
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
ASSIGN = r'='
LPAREN = r'\('
RPAREN = r'\)'
COMMA = r','
@_(r'true', r'false')
def BOOL(self, t):
t.value = t.value == 'true'
return t
@_(r'\n')
def NL(self, t):
self.lineno = 1
def error(self, t):
print("Illegal character '%s'" % t.value[0])
self.index = 1
Util classes:
from enum import Enum
class SupportedOp(str, Enum):
CONSTANT = "CONSTANT",
VARIABLE = "VARIABLE",
ARGS_LIST = "ARGS_LIST",
FUNC_CALL = "FUNC_CALL",
STATEMENT = "STATEMENT",
STATEMENT_LIST = "STATEMENT_LIST",
PROGRAM = "PROGRAM",
SUM = ' ',
SUB = '-',
DIV = '/',
MUL = '*',
ASSIGNMENT = '='
class ParsedOp(dict):
def __init__(self, op: SupportedOp, *values):
dict.__init__(self, op=op, values=values)
Parser:
class AParser(Parser):
debugfile = 'parser.out'
tokens = ALexer.tokens
precedence = (
#('nonassoc', LESSTHAN, GREATERTHAN), # Nonassociative operators
('left', PLUS, MINUS),
('left', TIMES, DIVIDE)
#('right', UMINUS), # Unary minus operator
)
@_('statement_list')
def program(self, p):
print('program', p[0])
return ParsedOp(SupportedOp.PROGRAM, p[0])
@_('statement BACK_IN_LINES statement_list')
def statement_list(self, p):
print('statement_list', p[0], p[1], p[2])
lst: list = p[1].values[0]
lst.append(p[0])
return ParsedOp(SupportedOp.STATEMENT_LIST, lst)
@_('statement')
def statement_list(self, p):
print('statement_list', p[0])
return ParsedOp(SupportedOp.STATEMENT_LIST, [p[0]])
@_('empty')
def statement_list(self, p):
print('empty statement_list')
return ParsedOp(SupportedOp.STATEMENT_LIST, [])
@_('NL BACK_IN_LINES', 'NL')
def BACK_IN_LINES(self, p):
print('BACK_IN_LINES', p[0])
#unused
return 'NL'
@_('assignment', 'expr')
def statement(self, p):
print('statement', p[0])
return ParsedOp(SupportedOp.STATEMENT, p[0])
@_('ID ASSIGN expr')
def assignment(self, p):
print('assignment', p[0], p[1], p[2])
return ParsedOp(SupportedOp.ASSIGNMENT, p[0], p[2])
@_('expr COMMA expr_list')
def expr_list(self, p):
print('expr_list', p[0], p[1], p[2])
lst: list = p[1].values[0]
lst.append(p[0])
return ParsedOp(SupportedOp.ARGS_LIST, lst)
@_('expr')
def expr_list(self, p):
print('expr_list', p[0])
return ParsedOp(SupportedOp.ARGS_LIST, [p[0]])
@_('empty')
def expr_list(self, p):
print('empty expr_list')
return ParsedOp(SupportedOp.ARGS_LIST, [])
@_('constant')
def expr(self, p):
print('expr', p[0])
return p[0]
@_('ID')
def expr(self, p):
print('expr', p[0])
return ParsedOp(SupportedOp.VARIABLE, p[0])
@_('LPAREN expr RPAREN')
def expr(self, p):
print('expr', p[0], p[1], p[2])
return p[1]
@_('ID LPAREN expr_list RPAREN')
def expr(self, p):
print('expr', p[0], p[1], p[2], p[3])
#if exists p.ID in functions
return ParsedOp(SupportedOp.FUNC_CALL, p.ID, p.expr_list)
@_( 'expr PLUS expr',
'expr MINUS expr',
'expr TIMES expr',
'expr DIVIDE expr')
def expr(self, p):
print('expr', p[0], p[1], p[2])
return ParsedOp(SupportedOp(p[1]), p[0], p[2])
@_('NUMBER', 'STRING', 'BOOL')
def constant(self, p):
print('constant', p[0])
return ParsedOp(SupportedOp.CONSTANT, p[0])
@_('')
def empty(self, p):
print('empty')
pass
def error(self, p):
if p:
print("Syntax error at token", p.type)
# Just discard the token and tell the parser it's okay.
self.errok()
else:
print("Syntax error at EOF")
I don't know if all my rules are ok but I commented every rule leaving uncommented only the "constant" rule which is straightforward, still can't recognize it.
Main:
tokens_out = lexer.ALexer().tokenize(event.get("code"))
tree = AParser().parse(tokens_out)
All my tokens are well recognized so the Lexer is ok. The parser can't recognize any rule. Any idea?
CodePudding user response:
As far as I can see, your code works fine up to the point at which you attempt to parse the second statement. I tried it with the input x=2
as suggested in your comment, and it produced the following result (pretty-printed with the pprint
module):
{ 'op': <SupportedOp.PROGRAM: 'PROGRAM'>,
'values': ( { 'op': <SupportedOp.STATEMENT_LIST: 'STATEMENT_LIST'>,
'values': ( [ { 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
'values': ( 'x',
{ 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
'values': ( 2,)})},)}],)},)}
But once you try to parse two statements --say x=2
and y=2
on separate lines-- things start to fall apart.
There are several issues, which I'll try to deal with one at a time.
The first problem is that your NL
lexer rule does not return anything, which means that no token is emitted. That means that the BACK_IN_LINES
rule cannot match, because it is expecting to see one or more NL
tokens. This generates a cascade of parser errors, which you should have seen on your console.
It's easy to get confused about the parser's progress. Since the parser doesn't see the NL
token, what it sees is x = 2 y ...
. At the moment at which it sees the y
, it hasn't yet reduced expr
; after all, the next token might have been a
or other operator. But an ID
is not one of the possible tokens which can follow 2
, so a syntax error is thrown immediately, with the result that expr
, statement
and statement_list
are not reduced, and so their reduction actions never execute, and thus you never see any of the debugging output you've put in the reduction actions. That doesn't really mean that the rule was not recognised; it would be more accurate to say that the rule was recognised but a longer match was still possible.
If you fix the lexer by adding return t
at the end of the NL
rule, then you run into the second problem, which is the reduction action for statement_list: statement BACK_IN_LINES statement_list
. This bug causes the parse to fail even for an input consisting only of x=2
, if that input is terminated with a newline character. The action is triggered because BACK_IN_LINES
is now recognised, so the grammar requires that the statement_list has two subcomponents, a
statement (
x=2) and a
statement_listconsisting of the rest of the input. The rest of the input is empty, which is OK because you allow a
statement_list` to match an empty input. But, as I said, that causes the reduction action to execute, and that reduction action includes:
lst: list = p[1].values[0]
That line has several problems. First, p[1]
is the BACK_IN_LINES
grammar symbol; you obviously intended to use p[2]
, which is the statement_list
. So p[1]
is the string NL
, which doesn't have a values
attribute. (That should be evident from the Python traceback, assuming that you can see the exception tracebacks. If you can't, you should seriously consider debugging in a Python console instead of whatever you are using.)
But when we fix it to use p[2]
instead, we still get a Python TypeError exception. Instead of complaining that a str
has no values
attribute, it now complains that a builtin_function_or_method
is not subscriptable. The builtin_function_or_method
is the values
method of a dict
object, becasue p[2]
is a SupportedOp
, which is subclassed from dict
. If you had intended to use the values
method, you would have had to have called it, but I don't think that was what you intended (and therefore the values
key is perhaps badly named). What you actually meant was the value associated with the values
key, which means that the line should read:
lst: list = p[2]["values"][0]
But that's still a bit problematic. Let's take a look at the parsed result of the input consisting of three lines:
first = 1
second = 2
third = 3
That produces (again, pretty printed with pprint
):
{ 'op': <SupportedOp.PROGRAM: 'PROGRAM'>,
'values': ( { 'op': <SupportedOp.STATEMENT_LIST: 'STATEMENT_LIST'>,
'values': ( [ { 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
'values': ( 'third',
{ 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
'values': ( 3,)})},)},
{ 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
'values': ( 'second',
{ 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
'values': ( 2,)})},)},
{ 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
'values': ( 'first',
{ 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
'values': ( 1,)})},)}],)},)}
If you look carefully at that output, you'll see that the three statements are indeed there, but in the wrong order.
That's the result of your choosing to use a right-recursive rule for statement_list
:
statement_list: statement BACK_IN_LINES statement_list
Right-recursive rules are practically never correct for bottom-up parsers (such as the LALR(1) parser generated by Sly), although there are times when they are needed. If you can choose between a left-recursive rule and a right-recursive rule, your best option is the left-recursive rule:
statement_list: statement_list BACK_IN_LINES statement
This has two advantages. First, it doesn't require use of the parser stack to hold all the intermediate and incomplete subparses of statement_list
until the end of the list is reached. More importantly, it executes the reduction actions for the recursive non-terminal in the order you probably expect them to be executed, which is left-to-right.
The right-recursive rule executes the reduction actions right-to-left, because the first recursive reduction is the last nested statement_list
. And the consequence is that which we have already seen: the statement
s are appended to the statement_list
in reverse order, starting with the last one.
It's easy to rewrite the rule as left-recursive. But if we do a naive change, we'll end up with a different problem. It's (probably) desirable that the input is allowed to but not required to end with a newline. That worked with the right-recursive production because statement_list
was allowed to be empty. But that won't help with the left-recursive rule; a left-recursive rule with an empty
option puts the empty
option at the beginning of the list (precisely because it produces the list left-to-right). With the left-recursive rule, it's more convenient to allow statement
to be empty. But instead of adding an "empty statement", which will clutter up the AST, we can just add a rule to statement_list
which accepts an empty sequence instead of a statement. Once we do that, we eliminate the need for BACK_IN_LINES
, because the grammatical logic is kept in statement_list
.
So we can remove BACK_IN_LINES
and replace the rules for statement_list
with the following:
@_('statement')
def statement_list(self, p):
print('statement_list', p[0])
return ParsedOp(SupportedOp.STATEMENT_LIST, [p[0]])
@_('empty')
def statement_list(self, p):
print('empty statement_list')
return ParsedOp(SupportedOp.STATEMENT_LIST, [])
@_('statement_list NL')
def statement_list(self, p):
print('statement_list', p[0])
return p[0]
@_('statement_list NL statement')
def statement_list(self, p):
print('statement_list', p[0], p[1], p[2])
lst: list = p[0]["values"][0]
lst.append(p[2])
return ParsedOp(SupportedOp.STATEMENT_LIST, lst)
Now, finally, we get the expected parse:
{ 'op': <SupportedOp.PROGRAM: 'PROGRAM'>,
'values': ( { 'op': <SupportedOp.STATEMENT_LIST: 'STATEMENT_LIST'>,
'values': ( [ { 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
'values': ( 'first',
{ 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
'values': ( 1,)})},)},
{ 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
'values': ( 'second',
{ 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
'values': ( 2,)})},)},
{ 'op': <SupportedOp.STATEMENT: 'STATEMENT'>,
'values': ( { 'op': <SupportedOp.ASSIGNMENT: '='>,
'values': ( 'third',
{ 'op': <SupportedOp.CONSTANT: 'CONSTANT'>,
'values': ( 3,)})},)}],)},)}