%% Wren tokenizer and parser from chapter 2 of the formal semantics
%% text (Slonneger and Kurtz 1995) with customization needed for the
%% full version and including comments and other changes for readability.
%%
%% Tom O'Hara
%% CS 571
%%
%% Slonneger, Kenneth and Barry L. Kurtz (1995), Formal Syntax and Semantics
%% of Programming Languages, Reading, MA: Addison-Wesley.
%%

%% TODO:
%% - make sure that all of the BNF is included in the code comments

%%------------------------------------------------------------------------
%% Scanner
%%
%% Tokenizes the input from the terminal or a file and produces a list
%% of tokens.

%%........................................................................
%% character type tests
%%
%% Predicate		Test: Is the character ...
%% lower(+Char)		lowercase
%% upper(+Char)		uppercase
%% digit(+Char)		a number
%% space(+Char)		the space character
%% tabch(+Char)		the tab character
%% period(+Char)	'.' (i.e., full-stop)
%% slash(+Char)		forward slash ('/')
%% endline(+Char)	newline
%% endfile(+Char)	end-of-file indicator
%%

%% lower(+Char): Char is the character code of a lowercase ASCII letter.
lower(Char) :- Char >= 0'a, Char =< 0'z.	% a-z (codes 97-122)

%% upper(+Char): Char is the character code of an uppercase ASCII letter.
upper(Char) :- Char >= 0'A, Char =< 0'Z.	% A-Z (codes 65-90)

%% digit(+Char): Char is the character code of a decimal digit.
digit(Char) :- Char >= 0'0, Char =< 0'9.	% 0-9 (codes 48-57)

%% Single-character classes, given as character codes.
space(32). 		% ' ' (blank)

tabch(9). 		% horizontal tab

period(46). 		% '.'

slash(47). 		% '/'

endline(10). 		% line feed
%% End-of-input markers. NOTE(review): 26 is Ctrl-Z (ASCII SUB) and -1 is
%% presumably the value get0/1 returns at end of file; which one is seen
%% depends on the Prolog system -- confirm for the target system.
endfile(26). 
endfile(-1). 

%% whitespace(+Char): Char separates tokens, i.e. it is a blank, a tab
%% or a newline.
whitespace(Char) :- space(Char).
whitespace(Char) :- tabch(Char).
whitespace(Char) :- endline(Char).

%% idchar(+Char): Char may appear after the first character of an
%% identifier, i.e. it is a lowercase letter or a digit.
idchar(Char) :- lower(Char).
idchar(Char) :- digit(Char).

%%........................................................................
%% Scanner proper

%% scan(-TokenList): tokenize the current input stream.
%%
%% Reads (and echoes) the first character, extracts the first token and
%% hands over to restprog/3 for the remainder. The echoed listing is
%% indented by 4 spaces. Fails if gettoken/3 fails on the first token.
%%
scan([FirstToken | MoreTokens]) :-
	tab(4),					% indent the first echoed line
	getch(FirstChar),
	gettoken(FirstChar, FirstToken, Following),
	!,					% keep this token even if later ones fail
	restprog(FirstToken, Following, MoreTokens).

%% getch(-Char): read the next character code from the current input
%% and echo it to the current output.
%%
%% Echo rules:
%%   - end of line: a newline followed by 4 spaces (tab(4)) is written
%%   - end of file: just a newline is written
%%   - otherwise:   the character itself is written
%%
%% The three cases are mutually exclusive, so they are written as an
%% if-then-else chain. With a plain disjunction, backtracking into
%% getch/1 would re-enter the last branch and echo the character a
%% second time even though get0/1 has already consumed it.
%%
getch(Char) :-
	get0(Char),
	(   endline(Char) -> nl, tab(4)
	;   endfile(Char) -> nl
	;   put(Char)
	).

%% restprog(+LastToken, +NextChar, -TokenList): given the previous token
%% and the next character in the input, return the list of tokens for the
%% remaining portion of the program.
%%
%% Once the previous token is eop the scan is finished. The cut matters:
%% without it, backtracking into a completed scan (for instance after the
%% parser rejects the token list) falls into the second clause, which
%% calls gettoken/3 on the dummy lookahead that accompanies eop and
%% prints a spurious "Illegal character" message.
%%
restprog(eop, _Char, TokenList) :-
	!,
	TokenList = [].		% end of file reached with previous character

restprog(_LastToken, Char, [Token | TokenList]) :-
	gettoken(Char, Token, NextChar),
	!,			% don't backtrack if subsequent errors found
	restprog(Token, NextChar, TokenList).

%% single(+CharCode, -Token): succeeds when the character is a complete
%% token by itself, returning the token symbol.
%%
single(40, lparen). 		% (
single(41, rparen). 		% )
single(42, times). 		% *
single(43, plus). 		% +
single(44, comma). 		% ,
single(45, minus). 		% -
single(47, divides). 		% /
single(59, semicolon). 		% ;
single(61, equal).		% =

%% double(+CharCode, -Token): succeeds when the character can either start
%% a two-character token (see pair/3) or be a token by itself. The symbol
%% returned is the one for the single-character reading.
%%
double(58, colon). 		% :
double(60, less). 		% <
double(62, grtr). 		% >

%% pair(+Char1Code, +Char2Code, -Symbol): succeeds when the two characters
%% together form a token, returning its symbol.
%%
%% NOTE: every first character listed here also has a double/2 entry,
%% i.e. pair(Char1, _Char2, _PairSymbol) => double(Char1, _SimpleSymbol)
%%
pair(58, 61, assign). 	% := 
pair(60, 61, lteq). 	% <= 
pair(60, 62, neq). 	% <> 
pair(62, 61, gteq).	% >=


%% reswd(+Identifier): succeeds when the identifier (an atom) is a reserved
%% word of Wren. gettoken/3 emits reserved words as the bare atom rather
%% than wrapping them in ide/1.
%%
reswd(and). 
reswd(begin). 
reswd(boolean). 
reswd(do). 
reswd(else). 
reswd(end). 
reswd(false). 
reswd(if). 
reswd(integer). 
reswd(is). 
reswd(not).
reswd(or). 
reswd(program). 
reswd(read). 
reswd(skip). 
reswd(then). 
reswd(true). 
reswd(var). 
reswd(while). 
reswd(write). 

%%........................................................................
%% Token definition

%% gettoken(+LookAhead, -Token, -NextChar): given the current lookahead
%% character, returns the next token and the lookahead character that
%% follows it.
%%
%% Each clause commits (cut) as soon as the class of LookAhead is known.
%% Input consumed by getch/1 cannot be un-read, so retrying another clause
%% for the same lookahead is never meaningful. Without the commit, a
%% failure further along the input would fall through to the final
%% catch-all clause and report an innocent character (for example the
%% whitespace before the real offender) as illegal.
%%

%% gettoken(+LookAhead, -num(Number), -NextChar): case for numeric tokens
%%
gettoken(LookAhead, num(Number), NextChar) :-
	digit(LookAhead),
	!,
	getch(CurrentChar),
	restnum(CurrentChar, CharList, NextChar),
	name(Number, [LookAhead | CharList]).

%% gettoken(+LookAhead, -ide(Name), -NextChar): case for alphanumeric tokens
%% gettoken(+LookAhead, -Name, -NextChar): case for reserved words
%%
%% A reserved word is returned as the bare atom and anything else as
%% ide(Name). If-then-else keeps the two readings exclusive, so a
%% reserved word can never be re-derived as an identifier.
%%
gettoken(LookAhead, Token, NextChar) :-
	lower(LookAhead),
	!,
	getch(CurrentChar),
	restid(CurrentChar, CharList, NextChar),
	name(ID, [LookAhead | CharList]),
	(   reswd(ID) -> Token = ID
	;   Token = ide(ID)
	).

%% gettoken(+LookAhead, -Symbol, -NextChar): cases for special symbols
%%
%% One-character symbols.
%%
gettoken(LookAhead, Token, NextChar) :-
	single(LookAhead, Token),
	!,
	getch(NextChar).

%% Symbols that may be the start of a two-character symbol: read one more
%% character and use the pair if there is one, otherwise fall back to the
%% one-character symbol and keep the extra character as the lookahead.
%%
gettoken(LookAhead, Token, NextChar) :-
	double(LookAhead, SimpleToken),
	!,
	getch(CurrentChar),
	(   pair(LookAhead, CurrentChar, Token) -> getch(NextChar)
	;   Token = SimpleToken,
	    NextChar = CurrentChar
	).

%% special case handling for end-of-file and whitespace
%%
%% End of file yields the eop token; the accompanying lookahead (0) is a
%% dummy value.
%%
gettoken(LookAhead, Token, NextChar) :-
	endfile(LookAhead),
	!,
	Token = eop,
	NextChar = 0.

%% Whitespace is skipped.
%%
gettoken(LookAhead, Token, NextChar) :-
	whitespace(LookAhead),
	!,
	getch(CurrentChar),
	gettoken(CurrentChar, Token, NextChar).

%% Display error message if illegal character encountered
%% NOTE: execution is not halted; the clause fails after printing the
%% message, so the enclosing call fails.
%%
gettoken(LookAhead, _Token, _NextChar) :-
	nl, write('Illegal character: '),
	put(LookAhead), nl,
	fail.


%% restnum(+LookAhead, -CharList, -NextChar): collect the remaining digits
%% of a numeral. CharList holds the digit codes starting at LookAhead
%% (empty if LookAhead is not a digit) and NextChar is the first character
%% after the numeral.
%%
%% The cut commits to "LookAhead is a digit": the characters after it have
%% already been consumed from the input, so on backtracking the second
%% clause must not be allowed to hand back a truncated numeral.
%%
restnum(LookAhead, [LookAhead | CharList], NextChar) :-
	digit(LookAhead),
	!,
	getch(CurrentChar),
	restnum(CurrentChar, CharList, NextChar).

restnum(LookAhead, [], LookAhead).	% end of number if not a digit


%% restid(+LookAhead, -CharList, -NextChar): collect the remaining
%% characters of an identifier. CharList holds the codes starting at
%% LookAhead (empty if LookAhead is not an identifier character) and
%% NextChar is the first character after the identifier.
%%
%% The cut commits to "LookAhead is an identifier character": the
%% characters after it have already been consumed from the input, so on
%% backtracking the second clause must not be allowed to hand back a
%% truncated identifier.
%%
restid(LookAhead, [LookAhead | CharList], NextChar) :-
	idchar(LookAhead),
	!,
	getch(CurrentChar),
	restid(CurrentChar, CharList, NextChar).

restid(LookAhead, [], LookAhead).	% end of identifier if not an ID char

%%........................................................................
%% Scanner interface

%% go_scan: interactive entry point for the scanner alone. Prompts for the
%% name of a Wren source file, reads it from the current input, and
%% tokenizes that file, writing the result to the current output.
%%
go_scan :-
	nl,
	write('>>> Scanning Wren <<<'), nl,
	nl,
	write('Enter name of source file: '), nl,
	getfilename(SourceFile),
	nl,
	go_scan_aux(SourceFile).

%% go_scan_aux(+FileName): tokenize the Wren program in the specified file
%% and write the token list.
%%
%% The file is closed (seen) whether or not the scan succeeds; otherwise a
%% failed scan would leave the current input redirected to the file. The
%% if-then-else also commits to the scan result, so later backtracking
%% cannot re-enter the scanner after its input has been consumed.
%%
go_scan_aux(FileName) :-
	see(FileName),
	(   scan(Tokens) -> seen
	;   seen, fail
	),
	write('Scan successful'), nl, nl,
	write(Tokens), nl.


%% getfilename(-FileName): return a symbol for a filename taken from the
%% current input. Characters are read up to and including the first one
%% that is not a filename character, which is then discarded.
%%
getfilename(FileName) :-
	get0(FirstChar),
	restfilename(FirstChar, NameCodes),
	name(FileName, NameCodes).


%% restfilename(+LookAhead, -CharList): given the current lookahead
%% character, return the list of characters forming a filename. Reading
%% stops at the first character that is not a filename character.
%%
%% The cut commits to "LookAhead belongs to the name". Without it, any
%% failure after getfilename/1 (e.g. a rejected parse) would backtrack
%% into here and produce the name with its last character dropped, then
%% the last two, and so on, each of which the caller would try to open.
%%
restfilename(LookAhead, [LookAhead | CharList]) :-
	filechar(LookAhead),
	!,
	get0(CurrentChar),
	restfilename(CurrentChar, CharList).

restfilename(_LookAhead, []).


%% filechar(+Char): indicates whether the character can be part of a filename
%% (i.e., [a-zA-Z0-9_./])
%%
%% The underscore is part of the documented set but was previously
%% rejected, which cut names such as my_prog.wren short at the '_'.
%%
filechar(Char) :-
	(   lower(Char)
	;   upper(Char)
	;   digit(Char)
	;   Char =:= 95			% '_'
	;   period(Char)
	;   slash(Char)
	).

%%------------------------------------------------------------------------
%% Parser
%%
%% DCG parser for the Wren language as defined in Figure 1.8 of the
%% formal semantics text (Slonneger and Kurtz 1995).
%%


%% <program> ::= program <identifier> is <block>
%%
%% The program name is checked for but not recorded; the syntax tree is
%% the one built for the block.
%%
program(Tree) -->
	[program, ide(_ProgramName), is],
	wren_block(Tree).


%% <block> ::= <declaration seq> begin <command seq> end
%%
%% result: prog(Declarations, Commands)
%%
wren_block(prog(Decls, Body)) -->
	decs(Decls),
	[begin],
	cmds(Body),
	[end].

%%........................................................................
%% Declaration section

%% <declaration seq> ::= e | <declaration> <declaration seq>
%% <declaration> ::= var <variable list> : <type> ;
%%
%% result: list of dec(Type, Vars) terms, one per declaration, in source
%% order ([] when there are no declarations).

decs(Declarations) -->
	dec(Declaration),
	restdecs(Declaration, Declarations).

decs([]) --> [].


%% restdecs(+Declaration, -Declarations): prepend the declaration just
%% parsed to whatever declarations follow.
%%
%% A single clause suffices: "no more declarations" is already covered by
%% decs([]). A separate base case here derived every declaration sequence
%% a second time, so each parse (and each failing parse of the commands
%% that follow) was produced twice.

restdecs(Declaration, [Declaration | Declarations]) -->
	decs(Declarations).


dec(dec(Type, Vars)) -->
	[var],
	varlist(Vars),
	[colon],
	type(Type),
	[semicolon].

%% <variable list> ::= <variable> | <variable> , <variable list>
%% <variable> ::= <identifier>
%%
%% result: list of the variable names (atoms), in source order

varlist([Var | Vars]) -->
	[ide(Var)],
	restvarlist(Vars).


restvarlist(Vars) -->
	[comma],
	varlist(Vars).

restvarlist([]) --> [].

%% <type> ::= integer | boolean

type(integer) --> [integer].

type(boolean) --> [boolean].

%% Note: the following parts of the BNF are handled by the tokenizer: 
%%
%% <identifier> ::= <letter> | <identifier> <letter> | <identifier> <digit>
%% <letter> ::= a | b | c | d | e | f | g | h | i | j | k | l | m
%%            | n | o | p | q | r | s | t | u | v | w | x | y | z
%% <numeral> ::= <digit> | <digit> <numeral>
%% <digit> ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
%%

%%........................................................................
%% Commands

%% <command seq> ::= <command> | <command> ; <command seq>
%%
%% result: non-empty list of command terms, in source order
%%
cmds(Commands) -->
	command(First),
	restcmds(First, Commands).


%% restcmds(+Cmd, -Cmds): Cmd has just been parsed; either a semicolon
%% and further commands follow, or Cmd is the last of the sequence.
%%
restcmds(First, [First | Others]) -->
	[semicolon],
	cmds(Others).

restcmds(Last, [Last]) --> [].

%% <command> ::= <variable> := <expr>
%%
%% result: assign(Variable, Expr)
%%
command(assign(Target, Value)) -->
	[ide(Target), assign],
	expr(Value).

%% <command> ::= skip
%%
command(skip) --> [skip].

%% <command> ::= read <variable> | write <integer expr>
%%
%% result: read(Variable) -or- write(Expr)

command(read(Target)) -->
	[read, ide(Target)].

command(write(Value)) -->
	[write],
	intexpr(Value).

%% <command> ::= while <boolean expr> do <command seq> end while
%%
%% result: while(Test, Body)
%%
command(while(Condition, Body)) -->
	[while],
	bool_expr(Condition),
	[do],
	cmds(Body),
	[end], [while].

%% <command> ::= if <boolean expr> then <command seq> end if
%%             | if <boolean expr> then <command seq> else <command seq> end if
%%
%% The part common to both forms is parsed here; restif//3 parses the
%% remainder and builds the result.

command(Cmd) -->
	[if],
	bool_expr(Condition),
	[then],
	cmds(ThenPart),
	restif(Condition, ThenPart, Cmd).


%% restif(+Test, +Then, -Cmd): parse what follows the then-part of an if
%% command.
%%
%% result: if(Test, Then, Else) when an else-part is present,
%%         if(Test, Then) otherwise
%%
restif(Condition, ThenPart, if(Condition, ThenPart, ElsePart)) -->
	[else],
	cmds(ElsePart),
	[end, if].

restif(Condition, ThenPart, if(Condition, ThenPart)) -->
	[end, if].


%%........................................................................
%% Expressions


%% <expr> ::= <integer expr> | <boolean expr>
%%
%% The integer reading is tried first; the boolean reading is tried on
%% backtracking.

expr(Expr) -->
	(   intexpr(Expr)
	;   bool_expr(Expr)
	).


%% <integer expr> ::= <term> | <integer expr> <weak op> <term>
%%
%% result: Term -or- exp(Op, Expr1, Expr2)
%%
%% The left-recursive rule is parsed iteratively: restintexpr//2 carries
%% the expression built so far and folds each further "<weak op> <term>"
%% into it, so operators associate to the left.

intexpr(Expr) -->
	term(First),
	restintexpr(First, Expr).


restintexpr(Left, Expr) -->
	weakop(Op),
	term(Right),
	restintexpr(exp(Op, Left, Right), Expr).

restintexpr(Expr, Expr) --> [].


%% <term> ::= <element> | <term> <strong op> <element>
%%
%% <weak op> ::= + | -
%% <strong op> ::= * | /
%%
%% result: Element -or- exp(Op, Term, Element), built left-associatively
%% in the same way as for integer expressions.

term(Term) -->
	element(First),
	restterm(First, Term).


restterm(Left, Term) -->
	strongop(Op),
	element(Right),
	restterm(exp(Op, Left, Right), Term).

restterm(Term, Term) --> [].


weakop(plus)  --> [plus].		% +
weakop(minus) --> [minus].		% -


strongop(times)   --> [times].		% *
strongop(divides) --> [divides].	% /

%% <element> ::= <numeral> | <variable> | ( <integer expr> ) | - <element>
%%
%% result: num(Number) -or- ide(Name) -or- the tree of the parenthesized
%% expression -or- minus(Element)

element(num(Value)) --> [num(Value)].

element(ide(Name)) --> [ide(Name)].

element(Inner) -->
	[lparen],
	intexpr(Inner),
	[rparen].

element(minus(Operand)) -->
	[minus],
	element(Operand).

%% <comparison> ::= <integer expr> <relation> <integer expr>
%% <relation>   ::= <= | < | = | > | >= | <>
%%
%% result: bexp(Relation, Expr1, Expr2)

comparison(bexp(Rel, Left, Right)) -->
	intexpr(Left),
	rel(Rel),
	intexpr(Right).

rel(equal) --> [equal].		% =
rel(neq)   --> [neq].		% <>
rel(less)  --> [less].		% <
rel(grtr)  --> [grtr].		% >
rel(gteq)  --> [gteq].		% >=
rel(lteq)  --> [lteq].		% <=

%% <bool expr> ::= <bool term> | <bool expr>  <weak bool op>  <bool term>
%% <bool term> ::= <bool element> | <bool term>  <strong bool op>  <bool element>
%%
%% <weak bool op> ::= or
%% <strong bool op> ::= and
%%
%% where bool is short for boolean
%%
%% result: Term -or- bexp(Op, Expr1, Expr2)
%%
%% As with integer expressions, the left-recursive rules are parsed
%% iteratively with an accumulator, so both operators associate to the
%% left.

bool_expr(Expr) -->
	bool_term(First),
	rest_bool_expr(First, Expr).


rest_bool_expr(Left, Expr) -->
	weak_bool_op(Op),
	bool_term(Right),
	rest_bool_expr(bexp(Op, Left, Right), Expr).

rest_bool_expr(Expr, Expr) --> [].


bool_term(Term) -->
	bool_element(First),
	rest_bool_term(First, Term).

rest_bool_term(Left, Term) -->
	strong_bool_op(Op),
	bool_element(Right),
	rest_bool_term(bexp(Op, Left, Right), Term).

rest_bool_term(Term, Term) --> [].


weak_bool_op(or) --> [or].
strong_bool_op(and) --> [and].


%% <boolean element> ::= true | false | <variable> | <comparison>
%%                    |  not ( <boolean expr> )  |  ( <boolean expr> )
%%
%% result: false -or- true -or- ide(Name) -or- the comparison's tree
%% -or- bnot(Expr) -or- the tree of the parenthesized expression

bool_element(false) --> [false].
bool_element(true) --> [true].
bool_element(ide(Name)) --> [ide(Name)].

bool_element(Cmp) -->
	comparison(Cmp).

bool_element(bnot(Negated)) -->
	[not, lparen],
	bool_expr(Negated),
	[rparen].

bool_element(Inner) -->
	[lparen],
	bool_expr(Inner),
	[rparen].


%%........................................................................
%% Parser interface

%% go_parse: interactive entry point for the scanner plus parser. Prompts
%% for the name of a Wren source file, reads it from the current input,
%% and then scans and parses that file.
%%
go_parse :-
	nl,
	write('>>> Interpreting: Wren <<<'), nl,
	nl,
	write('Enter name of source file: '), nl,
	getfilename(SourceFile),
	nl,
	go_parse_aux(SourceFile).

%% go_parse_aux(+FileName): scan and parse the specified Wren program,
%% writing the token list and then the syntax tree. Fails if either the
%% scan or the parse fails.
%%
%% The file is closed (seen) whether or not the scan succeeds; otherwise a
%% failed scan would leave the current input redirected to the file. The
%% if-then-else also commits to the scan result, so a rejected parse
%% cannot backtrack into the scanner after its input has been consumed.
%% The parse must account for every token up to the final eop, and only
%% the first parse is reported.
%%
go_parse_aux(FileName) :-
	see(FileName),
	(   scan(Tokens) -> seen
	;   seen, fail
	),
	write('Scan successful'), nl,
	write(Tokens), nl, nl,
	program(SyntaxTree, Tokens, [eop]),
	!,
	write('Parse successful'), nl,
	write(SyntaxTree), nl, nl.