EECE 571F= Domain-Specific Languages  
  
This is a page from yet
another great ECE, UBC subject.
[ Home | Assignments | Lectures |
DSL rules | Articles | Resources ]

Parsing

Lectures:
Old: 1 | 2 | 3 | 4 | 5
New: DCGnums | Parsing | Meta-prolog | Faster1 | Faster2 | Rand-nums
Not done: Abduction | Stochastic abs

g1=


digraph G {
     graph [rankdir = LR ];
     something we don't care about;
     graph [lp = "229,
	330"];
}

parse.out=


--| chars |------
 	100
	105
	103
	114
	97
	112
	104
	32
	71
	32
	123
	10
	32
	32
	32
	32
	32
	103
	114
	etc.,


--| tokens |------
 	graph
	$sopen
	rankdir
	$equal
	LR
	$sclose
	$end
	something
	we
	don't
	care
	about
	$end
	graph
	$sopen
	lp
	$equal
	$quote
	229
	$comma
	330
	$quote
	$sclose
	$end

--| statements |------
 	rankdir=LR
	lp=[229, 330]

parse.pl=


% parse.pl

% finding statements of the form X=Y
% within a complex string

:- [format].

demo :- tell('parse.out'), ignore(demo1), told.

demo1 :-
	chars(g1,Chars),             % seperate file I/O...
	tokens(Tokens,Chars,[]),     % .. from tokenisation
	parse(Statements,Tokens,[]), % .. from parsing
	!,  % <--- block backtracking- otherwise DCGs
            % can go awol.
	format('\n--| chars |--\n ~20L\tetc.,\n\n',[Chars]),	
	format('\n--| tokens |--\n ~L',           [Tokens]),
	format('\n--| statements |--\n ~L',   [Statements]).

        % note strange convention of DCGs:
        % inputs string is second last arg
        % outputs are the other args

% file i/o
chars(F,List) :- see(F), chars1(List), !, seen.
chars(_,[])   :- seen. % catch predicate if chars1 fails.
                       % never leave files open!

chars1(L) :-
	get0(X), % <-- primitive character reader
	chars2(X,L).

chars2(-1,[]) :- !.
chars2(H, [H|T]) :-
	get0(Next),
	chars2(Next,T).

/* understanding text strings is really three problems:
 1) tokenization - 
 2) parsing
 3) interpretation

Tokenization:

The individual atomic expressions of a language
are (usually) written down as sequences of simpler
characters. The text is ultimately then a
character stream. The task of a tokenizer is at
least to segment this character sequence turning
it into a token stream. The tokens (the fancy word
in this context for 'word' or 'atomic expression')
are often also assigned a category. We will support
two categories: $X will denote special words and X
(without a "$") will denote everything else.
*/

% note that we can "eat" more that one character in a leaf dcg
special(digraph) --> "digraph".
special(equal)   --> "=".
special(comma)   --> ",".
special(end)     --> ";".
special(quote)   --> [34]. % 34= "
% curly brackets
special(copen)   --> "{".
special(cclose)  --> "}".
% round brackets
special(ropen)   --> "(".
special(rclose)  --> ")".
% square brackets
special(sopen)   --> "[".
special(sclose)  --> "]".

/*
Once tokenized, the input stream is smaller, more abstract,
makes writing and debugging a parser easier.
*/

%-------------------------------------
% parser- assuming file written
parse(S)  --> statements(S).

% zero or more statements,
% and sometimes we collect information
% from them
statements([]) --> [].
statements([One|Rest]) -->
	[graph, $sopen], % statements beginning
	                 % with keyword "graph",
	                 % we collect
	assignment(One),
	[$sclose,$end],
	statements(Rest).
statements(Rest) -->     % all other statements
	                 % are dull and we will
	                 % ignore them
	dull,
	[$end],
	statements(Rest).

assignment(X=Y) --> [X,$equal], values(Y).

% values are either quotes lists...
values(Y) --> [$quote], items(Y), [$quote].
% ... or single items
values(Y) --> [Y].

% items are connect to other items via commas
items([One|Rest]) --> [One,$comma],items(Rest).
items([One])      --> [One].

% dull stuff- use carefully.
dull --> [].
dull --> [_], dull.

%-------------------------------------
% here's a tokenizer for graphviz files.
tokens(T) -->
	header,
	tokens1(T),  % ignore header and footer
	footer.
	
header -->
	whites,      % skip over whitespace
	"digraph",   % read, and ignore, a keyword
	whites,
	"G",
	whites,
	"{".

footer -->
	whites,      
	"}",
	whites.

% program body holds zero or more tokens
tokens1([]) --> [].
tokens1([One|Rest]) --> token(One), tokens1(Rest).

% tokens may have leading white space
token(X) --> whites, token1(X).

% two token types:
% type 1: specials (denoted with a leading "$"
token1($X) --> special(X).

% type 2: everything else
% note the use of predicate ordering to control
% the parse- hence, no backtracking please outside
% the call to a DCG parser.
token1(X)-->  blacks(X). % reads all non-white,
                          % non-special characaters

% spin down the input string to the first
% non-white space thing.

% note: good black to write a "comment skipper"
whites([],[]) :- !.
whites([H|T],Out) :-
	white(H),
	!,
	whites(T,Out).
whites(L,L).

white(H) :- space([H],[]).

space      --> " " | tabb| newline.
tabb       --> [9].
newline    --> [10].

% spin down the input string to the first
% non-white space or non-special thing.
blacks(X,L0,L) :-
	blacks1(Y,L0,L),
	name(X,Y). %<-- insert "num" here

blacks1([],[],[]) :- !.
blacks1([],L,L) :-
	% tricky stuff- if the NEXT thing
	% is a special, stop here and
	% return the list, unchanged
	special(_,L,_),!.
blacks1([H|Blacks],[H|T],Rest) :-
	\+ white(H),
	!,
	blacks1(Blacks,T,Rest).
blacks1([],L,L).

% phew- thank heavens that we don't need to do
% this everytime we write a DSL- for languages
% we can define with infix,prefix, postfix operators,
% prolog's "read" predicate does all this for us.




format.pl=


% format.pl

%--------------------------------------
% stuff to simplify printing clauses.
% swi-prolog lets a programmer customize
% the format statement.

% FIRST, a predicate of arity 2 is registerred
%        next to some letter
:- format_predicate('P',p(_,_)).

% SECOND, write the predicate.
p(default,X) :- !, p(0,X).
p(_,(X :- true)) :- !, format('~p.\n',X).
p(_,(X :- Y   )) :- !, portray_clause((X :- Y)).
p(N,[H|T]      )  :- !, not(not((numbervars([H|T],N,_),
	                     format('~p',[[H|T]])))).
p(N,X          ) :- not(not((numbervars(X,N,_),
	                     format('~p',X)))).

%--------------------------------------
% stuff to simplify right justifying text

% FIRST:
:- format_predicate('>',padChars(_,_)).

% SECOND:
padChars(default,A) :-
	padChars(5,A).
% the first arg "S" is the optional argument
% someone may have given with the "~" command
padChars(S,A) :-
	writeThing(A,Thing,N),
	Pad is S - N,
	% standard trick to emulate
	% for(i=1;i<=N;i++) { doThis }
	forall(between(1,Pad,_),put(32)),
	write(Thing).

writeThing(X,S,L) :-
	% sformat returns the string in
	% the first arg
	sformat(S,'~w',[X]), string_length(S,L).

%--------------------------------------
% stuff to simplify left justifying text

% FIRST:
:- format_predicate('<',charsPad(_,_)).

% SECOND:
charsPad(default,A) :- charsPad(5,A).
charsPad(S,A) :-
	writeThing(A,Thing,N),
	atom_length(A,N),
	Pad is S - N,
	write(Thing),
	forall(between(1,Pad,_),put(32)).

%--------------------------------------
% stuff to simplify printing N twiddles,
% scaled to some factor

% FIRST:
:- format_predicate('S',twiddle(_,_)).

% SECOND:
twiddle(default,A) :- twiddle(25,A).
twiddle(W,N) :-
	N1 is round(N/W),
	forall(between(1,N1,_),put(126)).

%--------------------------------------
% stuff to simplify printing lists
% scaled to some factor

% FIRST:
:- format_predicate('L',printL(_,_)).

% SECOND:
printL(default,List) :- printL(10^6,List). 
printL(TooLong,List) :-
	forall((nth1(Pos,List,Item),
	        Pos < TooLong),
	       format('\t~w\n',Item)).




Not © Tim Menzies, 2001
Share and enjoy- information wants to be free.
But if you take anything from this site,
please credit tim@menzies.com.