yecc(3erl)

1yecc(3)                    Erlang Module Definition                    yecc(3)
2
3
4

NAME

6       yecc - LALR-1 Parser Generator
7

DESCRIPTION

9       An  LALR-1  parser  generator  for Erlang, similar to yacc. Takes a BNF
10       grammar definition as input, and produces Erlang code for a parser.
11
12       To understand this text, you also have to look at the  yacc  documenta‐
13       tion  in  the UNIX(TM) manual. This is most probably necessary in order
14       to understand the idea of a parser generator,  and  the  principle  and
15       problems of LALR parsing with finite look-ahead.
16

DATA TYPES

18       error_info() =
19           {erl_anno:location() | none,
20            module(),
21            ErrorDescriptor :: term()}
22
23              The  standard  error_info()  structure that is returned from all
24              I/O modules. ErrorDescriptor is formattable by format_error/1.
25

EXPORTS

27       file(FileName) -> yecc_ret()
28
29       file(Grammarfile, Options) -> yecc_ret()
30
31              Types:
32
33                 Grammarfile = file:filename()
34                 Options = Option | [Option]
35                 Option =
36                     {error_location, column | line} |
37                     {includefile, Includefile :: file:filename()} |
38                     {report_errors, boolean()} |
39                     {report_warnings, boolean()} |
40                     {report, boolean()} |
41                     {return_errors, boolean()} |
42                     {return_warnings, boolean()} |
43                     {return, boolean()} |
44                     {parserfile, Parserfile :: file:filename()} |
45                     {verbose, boolean()} |
46                     {warnings_as_errors, boolean()} |
47                     report_errors | report_warnings | report |  return_errors
48                 |
49                     return_warnings | return | verbose | warnings_as_errors
50                 yecc_ret() = ok_ret() | error_ret()
51                 ok_ret() =
52                     {ok, Parserfile :: file:filename()} |
53                     {ok, Parserfile :: file:filename(), warnings()}
54                 error_ret() =
55                     error | {error, Errors :: errors(), Warnings :: warnings()}
56                 errors() = [{file:filename(), [error_info()]}]
57                 warnings() = [{file:filename(), [error_info()]}]
58
59              Grammarfile  is  the file of declarations and grammar rules. Re‐
60              turns ok upon success, or error if there are errors.  An  Erlang
61              file  containing  the  parser is created if there are no errors.
62              The options are:
63
64                {includefile, Includefile}.:
65                  Indicates a customized prologue file which the user may want
66                  to  use  instead  of  the  default  file  lib/parsetools/in‐
67                  clude/yeccpre.hrl which is otherwise included at the  begin‐
68                  ning  of  the resulting parser file. N.B. The Includefile is
69                  included 'as is' in the parser file, so it must not  have  a
70                  module  declaration  of  its  own, and it should not be com‐
71                  piled. It must, however, contain the necessary export decla‐
72                  rations. The default is indicated by "".
73
74                {parserfile, Parserfile}.:
75                  Parserfile is the name of the file that will contain the Er‐
76                  lang parser code that is generated. The default ("")  is  to
77                  add  the  extension .erl to Grammarfile stripped of the .yrl
78                  extension.
79
80                {report_errors, boolean()}.:
81                  Causes errors to be printed as they occur. Default is true.
82
83                {report_warnings, boolean()}.:
84                  Causes warnings to be printed  as  they  occur.  Default  is
85                  true.
86
87                {report, boolean()}.:
88                  This is a short form for both report_errors and report_warn‐
89                  ings.
90
91                {return_errors, boolean()}.:
92                  If this flag is set, {error, Errors, Warnings}  is  returned
93                  when there are errors. Default is false.
94
95                {return_warnings, boolean()}.:
96                  If  this  flag is set, an extra field containing Warnings is
97                  added to the tuple returned upon success. Default is false.
98
99                {return, boolean()}.:
100                  This is a short form for both return_errors and return_warn‐
101                  ings.
102
103                {verbose, boolean()}. :
104                  Determines whether the parser generator should give full in‐
105                  formation about resolved and unresolved  parse  action  con‐
106                  flicts  (true), or only about those conflicts that prevent a
107                  parser from being generated from the input  grammar  (false,
108                  the default).
109
110                {warnings_as_errors, boolean()}:
111                  Causes warnings to be treated as errors.
112
113                {error_location, column | line}.:
114                  If  the value of this flag is line, the location of warnings
115                  and errors is a line number. If the value is column, the lo‐
116                  cation  includes  a line number and a column number. Default
117                  is column.
118
119              Any of the Boolean options can be set to  true  by  stating  the
120              name  of the option. For example, verbose is equivalent to {ver‐
121              bose, true}.
122
123              The value of the Parserfile option stripped of the  .erl  exten‐
124              sion  is used by Yecc as the module name of the generated parser
125              file.
126
127              Yecc will add the extension .yrl to the  Grammarfile  name,  the
128              extension  .hrl  to the Includefile name, and the extension .erl
129              to the Parserfile name, unless the extension is already there.
130
131       format_error(ErrorDescriptor) -> io_lib:chars()
132
133              Types:
134
135                 ErrorDescriptor = term()
136
137              Returns a descriptive string in English of an error  reason  Er‐
138              rorDescriptor returned by yecc:file/1,2. This function is mainly
139              used by the compiler invoking Yecc.
140

DEFAULT YECC OPTIONS

142       The (host operating system) environment  variable  ERL_COMPILER_OPTIONS
143       can be used to give default Yecc options. Its value must be a valid Er‐
144       lang term. If the value is a list, it is used as is. If  it  is  not  a
145       list, it is put into a list.
146
147       The list is appended to any options given to file/2.
148
149       The list can be retrieved with  compile:env_compiler_options/0.
150

PRE-PROCESSING

152       A  scanner  to pre-process the text (program, etc.) to be parsed is not
153       provided in the yecc module. The scanner serves as a  kind  of  lexicon
154       look-up routine. It is possible to write a grammar that uses only char‐
155       acter tokens as terminal symbols, thereby eliminating the  need  for  a
156       scanner, but this would make the parser larger and slower.
157
158       The  user  should implement a scanner that segments the input text, and
159       turns it into one or more lists of tokens. Each token should be a tuple
160       containing  information  about syntactic category, position in the text
161       (e.g. line number), and the actual terminal symbol found in  the  text:
162       {Category, Position, Symbol}.
163
164       If  a  terminal symbol is the only member of a category, and the symbol
165       name is identical to the category name, the token format may  be  {Sym‐
166       bol, Position}.
167
168       A  list  of  tokens  produced  by the scanner should end with a special
169       end_of_input tuple which the parser is looking for. The format of  this
170       tuple should be {Endsymbol, EndPosition}, where Endsymbol is an identi‐
171       fier that is distinguished from all the terminal and non-terminal cate‐
172       gories  of the syntax rules. The Endsymbol may be declared in the gram‐
173       mar file (see below).
174
175       The simplest case is to segment the input string into a list of identi‐
176       fiers  (atoms) and use those atoms both as categories and values of the
177       tokens. For example, the input string aaa bbb 777,  X  may  be  scanned
178       (tokenized) as:
179
180       [{aaa, 1}, {bbb, 1}, {777, 1}, {',' , 1}, {'X', 1},
181        {'$end', 1}].
182
183       This  assumes  that  this is the first line of the input text, and that
184       '$end' is the distinguished end_of_input symbol.
185
186       The Erlang scanner in the io module can be used  as  a  starting  point
187       when  writing  a  new scanner. Study yeccscan.erl in order to see how a
188       filter can be added on top of io:scan_erl_form/3 to provide  a  scanner
189       for Yecc that tokenizes grammar files before parsing them with the Yecc
190       parser. A more general approach to scanner implementation is to  use  a
191       scanner  generator.  A scanner generator in Erlang called leex is under
192       development.
193

GRAMMAR DEFINITION FORMAT

195       Erlang style comments, starting with a  '%',  are  allowed  in  grammar
196       files.
197
198       Each declaration or rule ends with a dot (the character '.').
199
200       The  grammar  starts with an optional header section. The header is put
201       first in the generated file, before the module declaration. The purpose
202       of the header is to provide a means to make the documentation generated
203       by EDoc look nicer. Each header  line  should  be  enclosed  in  double
204       quotes, and newlines will be inserted between the lines. For example:
205
206       Header "%% Copyright (C)"
207       "%% @private"
208       "%% @Author John".
209
210       Next  comes  a  declaration of the nonterminal categories to be used in
211       the rules. For example:
212
213       Nonterminals sentence nounphrase verbphrase.
214
215       A non-terminal category can be used at the left hand side  (=  lhs,  or
216       head)  of  a grammar rule. It can also appear at the right hand side of
217       rules.
218
219       Next comes a declaration of the terminal categories, which are the cat‐
220       egories of tokens produced by the scanner. For example:
221
222       Terminals article adjective noun verb.
223
224       Terminal  categories may only appear in the right hand sides (= rhs) of
225       grammar rules.
226
227       Next comes a declaration of the rootsymbol, or start  category  of  the
228       grammar. For example:
229
230       Rootsymbol sentence.
231
232       This symbol should appear in the lhs of at least one grammar rule. This
233       is the most general syntactic category which the parser ultimately will
234       parse every input string into.
235
236       After  the  rootsymbol declaration comes an optional declaration of the
237       end_of_input symbol that your scanner is expected to use. For example:
238
239       Endsymbol '$end'.
240
241       Next comes one or more declarations of operator precedences, if needed.
242       These  are  used to resolve shift/reduce conflicts (see yacc documenta‐
243       tion).
244
245       Examples of operator declarations:
246
247       Right 100 '='.
248       Nonassoc 200 '==' '=/='.
249       Left 300 '+'.
250       Left 400 '*'.
251       Unary 500 '-'.
252
253       These declarations mean that '=' is defined as a right associative  bi‐
254       nary operator with precedence 100, '==' and '=/=' are operators with no
255       associativity, '+' and '*' are left associative binary operators, where
256       '*' takes precedence over '+' (the normal case), and '-' is a unary op‐
257       erator of higher precedence than '*'. The fact that '==' has  no  asso‐
258       ciativity  means  that  an  expression like a == b == c is considered a
259       syntax error.
260
261       Certain rules are assigned precedence: each rule  gets  its  precedence
262       from  the  last terminal symbol mentioned in the right hand side of the
263       rule. It is also possible to declare precedence for non-terminals, "one
264       level  up".  This is practical when an operator is overloaded (see also
265       example 3 below).
266
267       Next come the grammar rules. Each rule has the general form
268
269       Left_hand_side -> Right_hand_side : Associated_code.
270
271       The left hand side is a non-terminal category. The right hand side is a
272       sequence  of  one  or more non-terminal or terminal symbols with spaces
273       between. The associated code is a sequence of zero or more  Erlang  ex‐
274       pressions  (with  commas  ',' as separators). If the associated code is
275       empty, the separating colon ':' is also omitted. A final dot marks  the
276       end of the rule.
277
278       Symbols  such  as  '{', '.', etc., have to be enclosed in single quotes
279       when used as terminal or non-terminal symbols in grammar rules. The use
280       of the symbols '$empty', '$end', and '$undefined' should be avoided.
281
282       The  last  part  of the grammar file is an optional section with Erlang
283       code (= function definitions) which is included 'as is' in the  result‐
284       ing  parser  file. This section must start with the pseudo declaration,
285       or key words
286
287       Erlang code.
288
289       No syntax rule definitions or other declarations may follow  this  sec‐
290       tion.  To  avoid conflicts with internal variables, do not use variable
291       names beginning with two underscore characters  ('__')  in  the  Erlang
292       code  in  this  section,  or in the code associated with the individual
293       syntax rules.
294
295       The optional expect declaration can be placed anywhere before the  last
296       optional section with Erlang code. It is used for suppressing the warn‐
297       ing about conflicts that is ordinarily given if the grammar is  ambigu‐
298       ous. An example:
299
300       Expect 2.
301
302       The  warning  is  given if the number of shift/reduce conflicts differs
303       from 2, or if there are reduce/reduce conflicts.
304

EXAMPLES

306       A grammar to parse list expressions (with empty associated code):
307
308       Nonterminals list elements element.
309       Terminals atom '(' ')'.
310       Rootsymbol list.
311       list -> '(' ')'.
312       list -> '(' elements ')'.
313       elements -> element.
314       elements -> element elements.
315       element -> atom.
316       element -> list.
317
318       This grammar can be used to generate a parser which parses list expres‐
319       sions, such as (), (a), (peter charles), (a (b c) d (())), ... provided
320       that your scanner tokenizes, for example, the input (peter charles)  as
321       follows:
322
323       [{'(', 1} , {atom, 1, peter}, {atom, 1, charles}, {')', 1},
324        {'$end', 1}]
325
326       When  a grammar rule is used by the parser to parse (part of) the input
327       string as a grammatical phrase, the associated code is  evaluated,  and
328       the  value  of  the  last  expression  becomes  the value of the parsed
329       phrase. This value may be used by the parser later to build  structures
330       that  are  values  of  higher  phrases of which the current phrase is a
331       part. The values initially associated with terminal  category  phrases,
332       i.e. input tokens, are the token tuples themselves.
333
334       Below  is  an example of the grammar above with structure building code
335       added:
336
337       list -> '(' ')' : nil.
338       list -> '(' elements ')' : '$2'.
339       elements -> element : {cons, '$1', nil}.
340       elements -> element elements : {cons, '$1', '$2'}.
341       element -> atom : '$1'.
342       element -> list : '$1'.
343
344       With this code added to the grammar rules, the parser produces the fol‐
345       lowing  value  (structure) when parsing the input string (a b c).. This
346       still assumes that this was the first input line that the scanner  tok‐
347       enized:
348
349       {cons, {atom, 1, a,} {cons, {atom, 1, b},
350                                   {cons, {atom, 1, c}, nil}}}
351
352       The  associated  code  contains pseudo variables '$1', '$2', '$3', etc.
353       which refer to (are bound to) the values associated previously  by  the
354       parser  with the symbols of the right hand side of the rule. When these
355       symbols are terminal categories, the values are token tuples of the in‐
356       put string (see above).
357
358       The associated code may not only be used to build structures associated
359       with phrases, but may also be used for syntactic  and  semantic  tests,
360       printout  actions  (for  example  for tracing), etc. during the parsing
361       process. Since tokens contain positional (line number) information,  it
362       is  possible  to  produce error messages which contain line numbers. If
363       there is no associated code after the right hand side of the rule,  the
364       value '$undefined' is associated with the phrase.
365
366       The  right  hand side of a grammar rule may be empty. This is indicated
367       by using the special symbol '$empty' as  rhs.  Then  the  list  grammar
368       above may be simplified to:
369
370       list -> '(' elements ')' : '$2'.
371       elements -> element elements : {cons, '$1', '$2'}.
372       elements -> '$empty' : nil.
373       element -> atom : '$1'.
374       element -> list : '$1'.
375

GENERATING A PARSER

377       To call the parser generator, use the following command:
378
379       yecc:file(Grammarfile).
380
381       An  error  message from Yecc will be shown if the grammar is not of the
382       LALR type (for example too ambiguous). Shift/reduce conflicts  are  re‐
383       solved  in favor of shifting if there are no operator precedence decla‐
384       rations. Refer to the yacc documentation on the use of operator  prece‐
385       dence.
386
387       The  output  file  contains Erlang source code for a parser module with
388       module name equal to the Parserfile parameter. After  compilation,  the
389       parser  can  be called as follows (the module name is assumed to be my‐
390       parser):
391
392       myparser:parse(myscanner:scan(Inport))
393
394       The call format may be different if a customized prologue file has been
395       included  when  generating  the  parser  instead  of  the  default file
396       lib/parsetools/include/yeccpre.hrl.
397
398       With the standard prologue, this call will return either {ok,  Result},
399       where  Result  is  a structure that the Erlang code of the grammar file
400       has built, or {error, {Position, Module, Message}} if there was a  syn‐
401       tax error in the input.
402
403       Message  is  something  which may be converted into a string by calling
404       Module:format_error(Message) and printed with io:format/3.
405
406   Note:
407       By default, the parser that was generated will not print out error mes‐
408       sages  to  the screen. The user will have to do this either by printing
409       the returned error messages, or by inserting tests and  print  instruc‐
410       tions  in the Erlang code associated with the syntax rules of the gram‐
411       mar file.
412
413
414       It is also possible to make the parser ask for more input  tokens  when
415       needed if the following call format is used:
416
417       myparser:parse_and_scan({Function, Args})
418       myparser:parse_and_scan({Mod, Tokenizer, Args})
419
420       The tokenizer Function is either a fun or a tuple {Mod, Tokenizer}. The
421       call apply(Function, Args) or apply({Mod, Tokenizer}, Args) is executed
422       whenever a new token is needed. This, for example, makes it possible to
423       parse from a file, token by token.
424
425       The tokenizer used above has to be implemented so as to return  one  of
426       the following:
427
428       {ok, Tokens, EndPosition}
429       {eof, EndPosition}
430       {error, Error_description, EndPosition}
431
432       This  conforms  to  the format used by the scanner in the Erlang io li‐
433       brary module.
434
435       If  {eof,  EndPosition}  is   returned   immediately,   the   call   to
436       parse_and_scan/1  returns  {ok, eof}. If {eof, EndPosition} is returned
437       before the parser expects  end  of  input,  parse_and_scan/1  will,  of
438       course,  return an error message (see above). Otherwise {ok, Result} is
439       returned.
440

MORE EXAMPLES

442       1. A grammar for parsing infix arithmetic expressions into prefix nota‐
443       tion, without operator precedence:
444
445       Nonterminals E T F.
446       Terminals '+' '*' '(' ')' number.
447       Rootsymbol E.
448       E -> E '+' T: {'$2', '$1', '$3'}.
449       E -> T : '$1'.
450       T -> T '*' F: {'$2', '$1', '$3'}.
451       T -> F : '$1'.
452       F -> '(' E ')' : '$2'.
453       F -> number : '$1'.
454
455       2. The same with operator precedence becomes simpler:
456
457       Nonterminals E.
458       Terminals '+' '*' '(' ')' number.
459       Rootsymbol E.
460       Left 100 '+'.
461       Left 200 '*'.
462       E -> E '+' E : {'$2', '$1', '$3'}.
463       E -> E '*' E : {'$2', '$1', '$3'}.
464       E -> '(' E ')' : '$2'.
465       E -> number : '$1'.
466
467       3. An overloaded minus operator:
468
469       Nonterminals E uminus.
470       Terminals '*' '-' number.
471       Rootsymbol E.
472
473       Left 100 '-'.
474       Left 200 '*'.
475       Unary 300 uminus.
476
477       E -> E '-' E.
478       E -> E '*' E.
479       E -> uminus.
480       E -> number.
481
482       uminus -> '-' E.
483
484       4.  The  Yecc grammar that is used for parsing grammar files, including
485       itself:
486
487       Nonterminals
488       grammar declaration rule head symbol symbols attached_code
489       token tokens.
490       Terminals
491       atom float integer reserved_symbol reserved_word string char var
492       Rootsymbol grammar.
493       Endsymbol '$end'.
494       grammar -> declaration : '$1'.
495       grammar -> rule : '$1'.
496       declaration -> symbol symbols dot: {'$1', '$2'}.
497       rule -> head '->' symbols attached_code dot: {rule, ['$1' | '$3'],
498               '$4'}.
499       head -> symbol : '$1'.
500       symbols -> symbol : ['$1'].
501       symbols -> symbol symbols : ['$1' | '$2'].
502       attached_code -> ':' tokens : {erlang_code, '$2'}.
503       attached_code -> '$empty' : {erlang_code,
504                        [{atom, 0, '$undefined'}]}.
505       tokens -> token : ['$1'].
506       tokens -> token tokens : ['$1' | '$2'].
507       symbol -> var : value_of('$1').
508       symbol -> atom : value_of('$1').
509       symbol -> integer : value_of('$1').
510       symbol -> reserved_word : value_of('$1').
511       token -> var : '$1'.
512       token -> atom : '$1'.
513       token -> float : '$1'.
514       token -> integer : '$1'.
515       token -> string : '$1'.
516       token -> char : '$1'.
517       token -> reserved_symbol : {value_of('$1'), line_of('$1')}.
518       token -> reserved_word : {value_of('$1'), line_of('$1')}.
519       token -> '->' : {'->', line_of('$1')}.
520       token -> ':' : {':', line_of('$1')}.
521       Erlang code.
522       value_of(Token) ->
523           element(3, Token).
524       line_of(Token) ->
525           element(2, Token).
526
527   Note:
528       The symbols '->', and ':' have to be treated in a special way, as  they
529       are  meta  symbols of the grammar notation, as well as terminal symbols
530       of the Yecc grammar.
531
532
533       5. The file erl_parse.yrl in the lib/stdlib/src directory contains  the
534       grammar for Erlang.
535
536   Note:
537       Syntactic tests are used in the code associated with some rules, and an
538       error is thrown (and caught by the generated parser to produce an error
539       message) when a test fails. The same effect can be achieved with a call
540       to return_error(ErrorPosition, Message_string), which is defined in the
541       yeccpre.hrl default header file.
542
543

FILES

545       lib/parsetools/include/yeccpre.hrl
546