%% Wren tokenizer and parser from chapter 2 of the formal semantics
%% text (Slonneger and Kurtz 1995), with the customization needed for the
%% full version and including comments and other changes for readability.
%%
%% Tom O'Hara
%% CS 571
%%
%% Slonneger, Kenneth and Barry L. Kurtz (1995), Formal Syntax and Semantics
%% of Programming Languages, Reading, MA: Addison-Wesley.
%%
%% TODO:
%% - make sure that all of the BNF is included in the code comments

%%------------------------------------------------------------------------
%% Scanner
%%
%% Tokenizes the input from the terminal or a file and produces a list
%% of tokens.

%%........................................................................
%% character type tests
%%
%% All predicates below take a character code (an integer), as returned
%% by get0/1.
%%
%% Predicate            Test: Is the character ...
%% lower(+Char)         lowercase (a-z)
%% upper(+Char)         uppercase (A-Z)
%% digit(+Char)         a decimal digit (0-9)
%% space(+Char)         the space character
%% tabch(+Char)         the tab character
%% period(+Char)        '.' (i.e., full-stop)
%% slash(+Char)         forward slash ('/')
%% endline(+Char)       newline
%% endfile(+Char)       end-of-file indicator (code 26 or -1)
%% whitespace(+Char)    a space, tab or newline
%% idchar(+Char)        allowed after the first character of an identifier
%%                      (lowercase letter or digit)
%%
lower(Char) :- Char >= 0'a, Char =< 0'z.
upper(Char) :- Char >= 0'A, Char =< 0'Z.
digit(Char) :- Char >= 0'0, Char =< 0'9.

space(32).
tabch(9).
period(46).
slash(47).
endline(10).
endfile(26).
endfile(-1).

whitespace(Char) :- space(Char).
whitespace(Char) :- tabch(Char).
whitespace(Char) :- endline(Char).

idchar(Char) :- lower(Char).
idchar(Char) :- digit(Char).

%%........................................................................
%% Scanner proper

%% scan(-TokenList): returns list of tokens from input
%%
%% The first token is read here; restprog/3 collects the remainder.
%%
scan([FirstToken | MoreTokens]) :-
    tab(4),                             % indent the echoed source by 4 spaces
    getch(FirstChar),
    gettoken(FirstChar, FirstToken, LookAhead),
    !,                                  % don't backtrack if subsequent errors found
    restprog(FirstToken, LookAhead, MoreTokens).

%% getch(-Char): returns the next character from the input and outputs
%% it to the terminal.
%%
%% When the end of a line is reached, a newline is output followed by
%% four spaces of indentation. When the end of file is reached just a
%% newline is output.
%%
%% The echo is written as an if-then-else chain so that getch/1 leaves no
%% choice point: each character is read and echoed exactly once, even if
%% a later goal fails and Prolog backtracks.
getch(Char) :-
    get0(Char),
    (   endline(Char) -> nl, tab(4)
    ;   endfile(Char) -> nl
    ;   put(Char)
    ).

%% restprog(+LastToken, +NextChar, -TokenList): given the previous token
%% and the next character in the input, return a list of tokens for the
%% remaining portion of the program.
%%
%% The first clause commits with a cut: once eop has been produced the
%% token list is complete. Without the cut, a later failure (e.g. in the
%% parser) would backtrack into the second clause and try to tokenize the
%% dummy lookahead character, reporting a bogus illegal character.
%%
restprog(eop, _Char, []) :- !.          % end of file reached with previous character
restprog(_LastToken, Char, [Token | TokenList]) :-
    gettoken(Char, Token, NextChar),
    !,                                  % don't backtrack if subsequent errors found
    restprog(Token, NextChar, TokenList).

%% single(+CharCode, -Token): determines whether the character represents
%% a token by itself and if so returns the token symbol
%%
single(40, lparen).                     % (
single(41, rparen).                     % )
single(42, times).                      % *
single(43, plus).                       % +
single(44, comma).                      % ,
single(45, minus).                      % -
single(47, divides).                    % /
single(59, semicolon).                  % ;
single(61, equal).                      % =

%% double(+CharCode, -Token): determines whether the character can either
%% represent the start of a double character token or a token by itself.
%% If so, the symbol for the single-character token is returned.
%%
double(58, colon).                      % :
double(60, less).                       % <
double(62, grtr).                       % >

%% pair(+Char1Code, +Char2Code, -Symbol): determines whether the two
%% characters form a token and if so returns the symbol.
%%
%% NOTE: pair(Char1, _Char2, _DoubleSymbol) => double(Char1, _SingleSymbol)
%%       (the first character of every pair is also a double/2 character)
%%
pair(58, 61, assign).                   % :=
pair(60, 61, lteq).                     % <=
pair(60, 62, neq).                      % <>
pair(62, 61, gteq).                     % >=

%% reswd(+Identifier): determines whether the identifier is a reserved word
%%
reswd(and).
reswd(begin).
reswd(boolean).
reswd(do).
reswd(else).
reswd(end).
reswd(false).
reswd(if).
reswd(integer).
reswd(is).
reswd(not).
reswd(or).
reswd(program).
reswd(read).
reswd(skip).
reswd(then).
reswd(true).
reswd(var).
reswd(while).
reswd(write).

%%........................................................................
%% Token definition

%% gettoken(+LookAhead, -Token, -NextChar): given the current lookahead
%% character, returns the next token and lookahead character
%%
%% The clauses are tried in order and each one commits (cut) as soon as
%% its guard has classified LookAhead. This keeps the scanner
%% deterministic: characters already consumed by getch/1 are never
%% re-interpreted on backtracking, and the error clause at the end is
%% reached only for a character that no other clause accepts.
%%

%% gettoken(+LookAhead, -num(Number), -NextChar): case for numeric tokens
%%
gettoken(LookAhead, num(Number), NextChar) :-
    digit(LookAhead), !,
    getch(CurrentChar),
    restnum(CurrentChar, CharList, NextChar),
    name(Number, [LookAhead | CharList]).

%% gettoken(+LookAhead, -ide(Name), -NextChar): case for alphanumeric tokens
%% gettoken(+LookAhead, -Name, -NextChar): case for reserved words
%%
%% Identifiers start with a lowercase letter. A reserved word is returned
%% as the bare atom; any other name is wrapped as ide(Name).
%%
gettoken(LookAhead, Token, NextChar) :-
    lower(LookAhead), !,
    getch(CurrentChar),
    restid(CurrentChar, CharList, NextChar),
    name(ID, [LookAhead | CharList]),
    (   reswd(ID)
    ->  Token = ID
    ;   Token = ide(ID)
    ).

%% gettoken(+LookAhead, -Symbol, -NextChar): cases for special symbols
%%
gettoken(LookAhead, Token, NextChar) :-
    single(LookAhead, Token), !,
    getch(NextChar).

%% A character that may start a two-character symbol: read one more
%% character and use the pair if it forms a token; otherwise the first
%% character stands alone and the second becomes the lookahead.
%%
gettoken(LookAhead, Token, NextChar) :-
    double(LookAhead, SimpleToken), !,
    getch(CurrentChar),
    (   pair(LookAhead, CurrentChar, PairToken)
    ->  Token = PairToken,
        getch(NextChar)
    ;   Token = SimpleToken,
        NextChar = CurrentChar
    ).

%% special case handling for end-of-file and whitespace
%%
gettoken(LookAhead, eop, 0) :-
    endfile(LookAhead), !.

%% Whitespace is skipped: the token is whatever follows it.
%%
gettoken(LookAhead, Token, NextChar) :-
    whitespace(LookAhead), !,
    getch(CurrentChar),
    gettoken(CurrentChar, Token, NextChar).

%% Display error message if illegal character encountered
%% NOTE: the clause then fails, so the enclosing scan fails as well
%% (execution is not halted).
%%
gettoken(LookAhead, _Token, _NextChar) :-
    nl, write('Illegal character: '), put(LookAhead), nl,
    fail.

%% restnum(+LookAhead, -CharList, -NextChar): collects the remaining digits
%% of a numeric token. CharList holds the digits starting at LookAhead
%% (empty if LookAhead is not a digit) and NextChar is the first character
%% that is not part of the number.
%%
%% The cut makes the longest run of digits the only solution.
%%
restnum(LookAhead, [LookAhead | CharList], NextChar) :-
    digit(LookAhead), !,
    getch(CurrentChar),
    restnum(CurrentChar, CharList, NextChar).
restnum(LookAhead, [], LookAhead).
% end of number if not a digit

%% restid(+LookAhead, -CharList, -NextChar): collects the remaining
%% characters of an identifier. CharList holds the identifier characters
%% starting at LookAhead (empty if LookAhead is not an ID character) and
%% NextChar is the first character that is not part of the identifier.
%%
%% The cut makes the longest run of ID characters the only solution.
%%
restid(LookAhead, [LookAhead | CharList], NextChar) :-
    idchar(LookAhead), !,
    getch(CurrentChar),
    restid(CurrentChar, CharList, NextChar).
restid(LookAhead, [], LookAhead).       % end of identifier if not an ID char

%%........................................................................
%% Scanner interface

%% go_scan(): get the name of the Wren program and attempt to tokenize it.
%% The result is output to standard output.
%%
go_scan :-
    nl, write('>>> Scanning Wren <<<'), nl, nl,
    write('Enter name of source file: '), nl,
    getfilename(FileName), nl,
    go_scan_aux(FileName).

%% go_scan_aux(+FileName): tokenize the Wren program in the specified file
%%
%% The file is closed with seen/0 whether or not the scan succeeds, so a
%% failed scan does not leave input redirected to the file.
%%
go_scan_aux(FileName) :-
    see(FileName),
    (   scan(Tokens)
    ->  seen
    ;   seen, fail
    ),
    write('Scan successful'), nl, nl,
    write(Tokens), nl.

%% getfilename(-FileName): return a symbol (atom) for a filename taken
%% from standard input
%%
getfilename(FileName) :-
    get0(LookAhead),
    restfilename(LookAhead, CharList),
    name(FileName, CharList).

%% restfilename(+LookAhead, -CharList): given the current lookahead
%% character, return a list of characters forming a filename
%%
%% Reading stops at the first character that is not a filename character;
%% that character is consumed and discarded. The cut prevents shorter
%% prefixes of the name from being offered on backtracking.
%%
restfilename(LookAhead, [LookAhead | CharList]) :-
    filechar(LookAhead), !,
    get0(CurrentChar),
    restfilename(CurrentChar, CharList).
restfilename(_LookAhead, []).

%% filechar(+Char): indicates whether the character can be part of a filename
%% (i.e., [a-zA-Z0-9_./])
%%
filechar(95).                           % '_'
filechar(Char) :-
    (   lower(Char)
    ;   upper(Char)
    ;   digit(Char)
    ;   period(Char)
    ;   slash(Char)
    ).

%%------------------------------------------------------------------------
%% Parser
%%
%% DCG parser for the Wren language as defined in Figure 1.8 of the
%% formal semantics text (Slonneger and Kurtz 1995).
%%
%% Each rule consumes tokens produced by the scanner and builds a term
%% for the construct it recognizes.

%% <program> ::= program <identifier> is <block>
%%
%% The program name is checked for but not recorded in the syntax tree.
%%
program(SyntaxTree) -->
    [program], [ide(_ID)], [is],
    wren_block(SyntaxTree).
%% <block> ::= <declaration seq> begin <command seq> end
%%
%% result: prog(Declarations, Cmds)
%%
wren_block(prog(Declarations, Cmds)) -->
    decs(Declarations),
    [begin],
    cmds(Cmds),
    [end].

%%........................................................................
%% Declaration section

%% <declaration seq> ::= e | <declaration> <declaration seq>
%% <declaration>     ::= var <variable list> : <type> ;
%%
%% result: a (possibly empty) list of dec(Type, Vars) terms
%%
decs(Declarations) -->
    dec(Declaration),
    restdecs(Declaration, Declarations).
decs([]) --> [].

%% restdecs(+Declaration, -Declarations): Declarations is Declaration
%% followed by whatever further declarations can be parsed.
%%
%% decs//1 already covers the empty continuation, so a single clause is
%% enough; a separate "nothing follows" clause would only yield every
%% parse a second time.
%%
restdecs(Declaration, [Declaration | Declarations]) -->
    decs(Declarations).

dec(dec(Type, Vars)) -->
    [var], varlist(Vars), [colon], type(Type), [semicolon].

%% <variable list> ::= <variable> | <variable> , <variable list>
%% <variable>      ::= <identifier>
%%
%% result: list of identifier names
%%
varlist([Var | Vars]) -->
    [ide(Var)],
    restvarlist(Vars).

restvarlist(Vars) --> [comma], varlist(Vars).
restvarlist([]) --> [].

%% <type> ::= integer | boolean
%%
type(integer) --> [integer].
type(boolean) --> [boolean].

%% Note: the following parts of the BNF are handled by the tokenizer:
%%
%% <identifier> ::= <letter> | <identifier> <letter> | <identifier> <digit>
%% <letter>     ::= a | b | c | d | e | f | g | h | i | j | k | l | m
%%                | n | o | p | q | r | s | t | u | v | w | x | y | z
%% <numeral>    ::= <digit> | <digit> <numeral>
%% <digit>      ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
%%

%%........................................................................
%% Commands

%% <command seq> ::= <command> | <command> ; <command seq>
%%
%% result: non-empty list of command terms
%%
cmds(Cmds) -->
    command(Cmd),
    restcmds(Cmd, Cmds).

restcmds(Cmd, [Cmd | Cmds]) --> [semicolon], cmds(Cmds).
restcmds(Cmd, [Cmd]) --> [].

%% <command> ::= <variable> := <expr>           (":=" is the assign token)
%%
command(assign(Var, Expr)) -->
    [ide(Var)], [assign], expr(Expr).

%% <command> ::= skip
%%
command(skip) --> [skip].

%% <command> ::= read <variable> | write <integer expr>
%%
command(read(Var)) --> [read], [ide(Var)].
command(write(Expr)) --> [write], intexpr(Expr).

%% <command> ::= while <boolean expr> do <command seq> end while
%%
command(while(Test, Body)) -->
    [while], bool_expr(Test), [do],
    cmds(Body),
    [end, while].

%% <command> ::= if <boolean expr> then <command seq> end if
%%             | if <boolean expr> then <command seq> else <command seq> end if
%%
%% The common prefix is parsed once; restif//3 then builds either
%% if(Test, Then, Else) or if(Test, Then).
%%
command(Cmd) -->
    [if], bool_expr(Test), [then],
    cmds(Then),
    restif(Test, Then, Cmd).

restif(Test, Then, if(Test, Then, Else)) -->
    [else], cmds(Else), [end], [if].
restif(Test, Then, if(Test, Then)) -->
    [end], [if].

%%........................................................................
%% Expressions

%% <expr> ::= <integer expr> | <boolean expr>
%%
expr(Expr) --> intexpr(Expr).
expr(Expr) --> bool_expr(Expr).         % second alternative of <expr>

%% <integer expr> ::= <term> | <integer expr> <weak op> <term>
%%
%% result: Term -or- exp(Op, Expr1, Expr2)
%%
%% The left-recursive rule is parsed iteratively: restintexpr//2 carries
%% the tree built so far and extends it to the left-associated result for
%% as long as another weak operator follows.
%%
intexpr(Expr) -->
    term(First),
    restintexpr(First, Expr).

restintexpr(Left, Expr) -->
    (   weakop(Op),
        term(Right),
        restintexpr(exp(Op, Left, Right), Expr)
    ;   { Expr = Left }
    ).

%% <term> ::= <element> | <term> <strong op> <element>
%%
%% <weak op>   ::= + | -
%% <strong op> ::= * | /
%%
%% result: Element -or- exp(Op, Term1, Element2)
%%
term(Term) -->
    element(First),
    restterm(First, Term).

restterm(Left, Term) -->
    (   strongop(Op),
        element(Right),
        restterm(exp(Op, Left, Right), Term)
    ;   { Term = Left }
    ).

weakop(plus)  --> [plus].
weakop(minus) --> [minus].

strongop(times)   --> [times].
strongop(divides) --> [divides].

%% <element> ::= <numeral> | <variable> | ( <integer expr> ) | - <element>
%%
%% result: num(N), ide(Name), the tree of the parenthesized expression,
%%         or minus(Element) for unary minus
%%
element(num(Value)) --> [num(Value)].
element(ide(Name))  --> [ide(Name)].
element(Inner)      --> [lparen], intexpr(Inner), [rparen].
element(minus(Arg)) --> [minus], element(Arg).

%% <comparison> ::= <integer expr> <relation> <integer expr>
%% <relation>   ::= <= | < | = | > | >= | <>
%%
%% result: bexp(Relation, Expr1, Expr2)
%%
comparison(bexp(Relation, Lhs, Rhs)) -->
    intexpr(Lhs),
    rel(Relation),
    intexpr(Rhs).

rel(equal) --> [equal].
rel(neq)   --> [neq].
rel(less)  --> [less].
rel(grtr)  --> [grtr].
rel(gteq)  --> [gteq].
rel(lteq)  --> [lteq].

%% <bool expr> ::= <bool term> | <bool expr> <weak bool op> <bool term>
%% <bool term> ::= <bool element> | <bool term> <strong bool op> <bool element>
%%
%% <weak bool op>   ::= or
%% <strong bool op> ::= and
%%
%% where bool is short for boolean
%%
%% result: Term -or- bexp(Op, Expr1, Expr2)
%%
bool_expr(Expr) -->
    bool_term(First),
    rest_bool_expr(First, Expr).

rest_bool_expr(Left, Expr) -->
    (   weak_bool_op(Op),
        bool_term(Right),
        rest_bool_expr(bexp(Op, Left, Right), Expr)
    ;   { Expr = Left }
    ).

bool_term(Term) -->
    bool_element(First),
    rest_bool_term(First, Term).

rest_bool_term(Left, Term) -->
    (   strong_bool_op(Op),
        bool_element(Right),
        rest_bool_term(bexp(Op, Left, Right), Term)
    ;   { Term = Left }
    ).

weak_bool_op(or) --> [or].
strong_bool_op(and) --> [and].

%% <bool element> ::= true | false | <variable> | <comparison>
%%                  | not ( <bool expr> ) | ( <bool expr> )
%%
bool_element(false) --> [false].
bool_element(true) --> [true].
bool_element(ide(Name)) --> [ide(Name)].
bool_element(Cmp) --> comparison(Cmp).
bool_element(bnot(Expression)) -->      % not ( <bool expr> )
    [not], [lparen], bool_expr(Expression), [rparen].
bool_element(Expression) -->            % ( <bool expr> )
    [lparen], bool_expr(Expression), [rparen].

%%........................................................................
%% Parser interface

%% go_parse(): invoke the Wren scanner and parser
%%
%% Prompts for a file name on standard input, then scans and parses that
%% file.
%%
go_parse :-
    nl, write('>>> Interpreting: Wren <<<'), nl, nl,
    write('Enter name of source file: '), nl,
    getfilename(FileName), nl,
    go_parse_aux(FileName).

%% go_parse_aux(+FileName): scan and parse the specified Wren program
%%
%% The token list and the resulting syntax tree are written to the
%% current output. The predicate fails if either scanning or parsing
%% fails.
%%
%% The scan is committed with if-then-else and the file is closed with
%% seen/0 on both outcomes, so that
%%   - a failed scan does not leave input redirected to the file, and
%%   - a failed parse does not backtrack into the scanner.
%%
%% The scanner ends the token list with eop; the parser is asked to
%% consume everything up to that final token.
%%
go_parse_aux(FileName) :-
    see(FileName),
    (   scan(Tokens)
    ->  seen
    ;   seen, fail
    ),
    write('Scan successful'), nl,
    write(Tokens), nl, nl,
    program(SyntaxTree, Tokens, [eop]),
    write('Parse successful'), nl,
    write(SyntaxTree), nl, nl.