I have successfully created a scanner for a parser which in turn will be used with a code generator to create a complete compiler. My parser right now can successfully parse assignments, if, if-else, while-do
, comparisons, read and write
commands. The only problem I have is when trying to assign something after a while-do
or if/if-else
it fails.
Update: I can actually assign variables after an if\if else
and while-do
statements, but another problem has been found. I cannot make any nested if/if else
or while-do
statements. So now the problem is how to fix nested boolean and loop statements.
The way my parser works with if-then
statements is:
if z < 4 then
z := z + 7
end
What is written out is:
Statements
If
LESS
z
4
Statements
Assign
z
ADD
z
7
It recognizes the if and runs a method called if_statement()
which scans the if
token, sets the comparison equal to a variable, scans the then
token, and sets the statements after then to a variable. Next, in order to accommodate for both if
and if-else
statements, I use a method called lookahead() which looks at the next token. If it sees end
, it returns the variables to be written to a text document. If it sees else
, it the scans the else
token, and assigns the statements after to a variable, then looks for the end
token. Then it returns all the variables to be written to a text document.
The code for just this method is as follows:
def if_statement():
scanner.consume(Token.IF)
condition = comparison()
scanner.consume(Token.THEN)
then = statements()
if scanner.lookahead() == Token.END:
scanner.consume(Token.END)
return If_AST(condition, then)
else:
scanner.consume(Token.ELSE)
otherwise = statements()
scanner.consume(Token.END)
return If_Else_AST(condition, then, otherwise)
If_AST and If_Else_AST are as follows:
class If_AST:
def __init__(self, condition, then):
self.condition = condition
self.then = then
def __repr__(self):
return 'if ' + repr(self.condition) + ' then ' + \
repr(self.then) + ' end'
def indented(self, level):
return indent('If', level) + \
self.condition.indented(level+1) + \
self.then.indented(level+1)
class If_Else_AST:
def __init__(self, condition, then, otherwise):
self.condition = condition;
self.then = then;
self.otherwise = otherwise;
def __repr__(self):
return 'if ' + repr(self.condition) + ' then ' +\
repr(self.then) + ' else ' + \
repr(self.otherwise) + ' end'
def indented(self, level):
return indent('If-Else', level) + \
self.condition.indented(level+1) + \
self.then.indented(level+1) + \
self.otherwise.indented(level+1)
For while-do
, it instead replaces if
with while
and then
with do
. But for good measure, I will also include the while method and class:
def while_statement():
scanner.consume(Token.WHILE)
condition = comparison()
scanner.consume(Token.DO)
body = statements()
scanner.consume(Token.END)
return While_AST(condition, body)
class While_AST:
def __init__(self, condition, body):
self.condition = condition
self.body = body
def __repr__(self):
return 'while ' + repr(self.condition) + ' do ' + \
repr(self.body) + ' end'
def indented(self, level):
return indent('While', level) + \
self.condition.indented(level+1) + \
self.body.indented(level+2)
I realize that my parser does not support else if
statements. Else if
statements are not part of this exercise.
And lastly, just in case someone needs to see everything, my full code so far is as follows:
import re
import sys
class Scanner:
'''The interface comprises the methods lookahead and consume.
Other methods should not be called from outside of this class.'''
def __init__(self, input_file):
'''Reads the whole input_file to input_string, which remains constant.
current_char_index counts how many characters of input_string have
been consumed.
current_token holds the most recently found token and the
corresponding part of input_string.'''
# source code of the program to be compiled
self.input_string = input_file.read()
# index where the unprocessed part of input_string starts
self.current_char_index = 0
# a pair (most recently read token, matched substring of input_string)
self.current_token = self.get_token()
def skip_white_space(self):
'''Consumes all characters in input_string up to the next
non-white-space character.'''
if (self.current_char_index >= len(self.input_string) - 1):
return
while self.input_string[self.current_char_index].isspace():
self.current_char_index += 1
def get_token(self):
'''Returns the next token and the part of input_string it matched.
The returned token is None if there is no next token.
The characters up to the end of the token are consumed.'''
self.skip_white_space()
# find the longest prefix of input_string that matches a token
token, longest = None, ''
for (t, r) in Token.token_regexp:
match = re.match(r, self.input_string[self.current_char_index:])
if match and match.end() > len(longest):
token, longest = t, match.group()
# consume the token by moving the index to the end of the matched part
self.current_char_index += len(longest)
return (token, longest)
def lookahead(self):
'''Returns the next token without consuming it.
Returns None if there is no next token.'''
return self.current_token[0]
def consume(self, *tokens):
'''Returns the next token and consumes it, if it is in tokens.
Raises an exception otherwise.
If the token is a number or an identifier, its value is returned
instead of the token.'''
current = self.current_token
if (len(self.input_string[self.current_char_index:]) == 0):
self.current_token = (None, '')
else:
self.current_token = self.get_token()
if current[0] in tokens:
if current[0] is Token.ID:
#print("ID")
return current[1]
elif current[0] is Token.NUM:
#print("NUM")
return current[1]
else:
return current[0]
else:
raise Exception('non-token detected')
class Token:
# The following enumerates all tokens.
DO = 'DO'
ELSE = 'ELSE'
READ = 'READ'
WRITE = 'WRITE'
END = 'END'
IF = 'IF'
THEN = 'THEN'
WHILE = 'WHILE'
SEM = 'SEM'
BEC = 'BEC'
LESS = 'LESS'
EQ = 'EQ'
GRTR = 'GRTR'
LEQ = 'LEQ'
NEQ = 'NEQ'
GEQ = 'GEQ'
ADD = 'ADD'
SUB = 'SUB'
MUL = 'MUL'
DIV = 'DIV'
LPAR = 'LPAR'
RPAR = 'RPAR'
NUM = 'NUM'
ID = 'ID'
# The following list gives the regular expression to match a token.
# The order in the list matters for mimicking Flex behaviour.
# Longer matches are preferred over shorter ones.
# For same-length matches, the first in the list is preferred.
token_regexp = [
(DO, 'do'),
(ELSE, 'else'),
(READ, 'read'),
(WRITE, 'write'),
(END, 'end'),
(IF, 'if'),
(THEN, 'then'),
(WHILE, 'while'),
(SEM, ';'),
(BEC, ':='),
(LESS, '<'),
(EQ, '='),
(NEQ, '!='),
(GRTR, '>'),
(LEQ, '<='),
(GEQ, '>='),
(ADD, '[+]'), # + is special in regular expressions
(SUB, '-'),
(MUL, '[*]'),
(DIV, '[/]'),
(LPAR, '[(]'), # ( is special in regular expressions
(RPAR, '[)]'), # ) is special in regular expressions
(ID, '[a-z]+'),
(NUM, '[0-9]+'),
]
def indent(s, level):
return ' '*level + s + '\n'
# Each of the following classes is a kind of node in the abstract syntax tree.
# indented(level) returns a string that shows the tree levels by indentation.
class Program_AST:
def __init__(self, program):
self.program = program
def __repr__(self):
return repr(self.program)
def indented(self, level):
return self.program.indented(level)
class Statements_AST:
def __init__(self, statements):
self.statements = statements
def __repr__(self):
result = repr(self.statements[0])
for st in self.statements[1:]:
result += '; ' + repr(st)
return result
def indented(self, level):
result = indent('Statements', level)
for st in self.statements:
result += st.indented(level+1)
return result
class If_AST:
def __init__(self, condition, then):
self.condition = condition
self.then = then
def __repr__(self):
return 'if ' + repr(self.condition) + ' then ' + \
repr(self.then) + ' end'
def indented(self, level):
return indent('If', level) + \
self.condition.indented(level+1) + \
self.then.indented(level+1)
class If_Else_AST:
def __init__(self, condition, then, otherwise):
self.condition = condition;
self.then = then;
self.otherwise = otherwise;
def __repr__(self):
return 'if ' + repr(self.condition) + ' then ' +\
repr(self.then) + ' else ' + \
repr(self.otherwise) + ' end'
def indented(self, level):
return indent('If-Else', level) + \
self.condition.indented(level+1) + \
self.then.indented(level+1) + \
self.otherwise.indented(level+1)
class While_AST:
def __init__(self, condition, body):
self.condition = condition
self.body = body
def __repr__(self):
return 'while ' + repr(self.condition) + ' do ' + \
repr(self.body) + ' end'
def indented(self, level):
return indent('While', level) + \
self.condition.indented(level+1) + \
self.body.indented(level+2)
class Assign_AST:
def __init__(self, identifier, expression):
self.identifier = identifier
self.expression = expression
def __repr__(self):
return repr(self.identifier) + ':=' + repr(self.expression)
def indented(self, level):
return indent('Assign', level) + \
self.identifier.indented(level+1) + \
self.expression.indented(level+1)
class Write_AST:
def __init__(self, expression):
self.expression = expression
def __repr__(self):
return 'write ' + repr(self.expression)
def indented(self, level):
return indent('Write', level) + self.expression.indented(level+1)
class Read_AST:
def __init__(self, identifier):
self.identifier = identifier
def __repr__(self):
return 'read ' + repr(self.identifier)
def indented(self, level):
return indent('Read', level) + self.identifier.indented(level+1)
class Comparison_AST:
def __init__(self, left, op, right):
self.left = left
self.op = op
self.right = right
def __repr__(self):
op = { Token.LESS:'<', Token.EQ:'=', Token.GRTR:'>',
Token.LEQ:'<=', Token.NEQ:'!=', Token.GEQ:'>=' }
return repr(self.left) + op[self.op] + repr(self.right)
def indented(self, level):
return indent(self.op, level) + \
self.left.indented(level+1) + \
self.right.indented(level+1)
class Expression_AST:
def __init__(self, left, op, right):
self.left = left
self.op = op
self.right = right
def __repr__(self):
op = { Token.ADD:'+', Token.SUB:'-', Token.MUL:'*', Token.DIV:'/' }
return '(' + repr(self.left) + op[self.op] + repr(self.right) + ')'
def indented(self, level):
return indent(self.op, level) + \
self.left.indented(level+1) + \
self.right.indented(level+1)
class Number_AST:
def __init__(self, number):
self.number = number
def __repr__(self):
return self.number
def indented(self, level):
return indent(self.number, level)
class Identifier_AST:
def __init__(self, identifier):
self.identifier = identifier
def __repr__(self):
return self.identifier
def indented(self, level):
return indent(self.identifier, level)
# The following methods comprise the recursive-descent parser.
def program():
sts = statements()
return Program_AST(sts)
def statements():
result = [statement()]
while scanner.lookahead() == Token.SEM:
scanner.consume(Token.SEM)
st = statement()
result.append(st)
return Statements_AST(result)
def statement():
if scanner.lookahead() == Token.IF:
return if_statement()
elif scanner.lookahead() == Token.WHILE:
return while_statement()
elif scanner.lookahead() == Token.ID:
return assignment()
elif scanner.lookahead() == Token.READ:
return read();
elif scanner.lookahead() == Token.WRITE:
return write();
else: # error
return scanner.consume(Token.IF, Token.WHILE, Token.ID)
def if_statement():
scanner.consume(Token.IF)
condition = comparison()
scanner.consume(Token.THEN)
then = statements()
if scanner.lookahead() == Token.END:
scanner.consume(Token.END)
return If_AST(condition, then)
else:
scanner.consume(Token.ELSE)
otherwise = statements()
scanner.consume(Token.END)
return If_Else_AST(condition, then, otherwise)
def while_statement():
scanner.consume(Token.WHILE)
condition = comparison()
scanner.consume(Token.DO)
body = statements()
scanner.consume(Token.END)
return While_AST(condition, body)
def assignment():
ident = identifier()
scanner.consume(Token.BEC)
expr = expression()
return Assign_AST(ident, expr)
def read():
scanner.consume(Token.READ)
i = identifier()
return Read_AST(i)
def comparison():
left = expression()
op = scanner.consume(Token.LESS, Token.EQ, Token.GRTR,
Token.LEQ, Token.NEQ, Token.GEQ)
right = expression()
return Comparison_AST(left, op, right)
def write():
scanner.consume(Token.WRITE)
e = expression()
return Write_AST(e)
def expression():
result = term()
while scanner.lookahead() in [Token.ADD, Token.SUB]:
op = scanner.consume(Token.ADD, Token.SUB)
tree = term()
result = Expression_AST(result, op, tree)
return result
def term():
result = factor()
while scanner.lookahead() in [Token.MUL, Token.DIV]:
op = scanner.consume(Token.MUL, Token.DIV)
tree = factor()
result = Expression_AST(result, op, tree)
return result
def factor():
if scanner.lookahead() == Token.LPAR:
scanner.consume(Token.LPAR)
result = expression()
scanner.consume(Token.RPAR)
return result
elif scanner.lookahead() == Token.NUM:
value = scanner.consume(Token.NUM)
return Number_AST(value)
elif scanner.lookahead() == Token.ID:
return identifier()
else: # error
return scanner.consume(Token.LPAR, Token.NUM, Token.ID)
def identifier():
value = scanner.consume(Token.ID)
return Identifier_AST(value)
# Initialise scanner.
scanner = Scanner(sys.stdin)
# Show all tokens in the input.
#token = scanner.lookahead()
#test = ''
#
#while token != None:
# print(scanner.consume(token))
# token = scanner.lookahead()
#
#Call the parser.
ast = program()
if scanner.lookahead != None:
print(ast.indented(0))
exit()
#if scanner.lookahead() != None:
# raise Exception('end of input expected but token ' + repr(scanner.lookahead()) + ' found')
# Show the syntax tree with levels indicated by indentation.
Any hints on how to fix my code would be awesome. Thanks!