yecc(3erl)

1yecc(3)                    Erlang Module Definition                    yecc(3)
2
3
4

NAME

6       yecc - LALR-1 Parser Generator
7

DESCRIPTION

9       An  LALR-1  parser  generator  for Erlang, similar to yacc. Takes a BNF
10       grammar definition as input, and produces Erlang code for a parser.
11
12       To understand this text, you also have to look at the  yacc  documenta‐
13       tion  in  the UNIX(TM) manual. This is most probably necessary in order
14       to understand the idea of a parser generator,  and  the  principle  and
15       problems of LALR parsing with finite look-ahead.
16

DATA TYPES

18       error_info() =
19           {erl_anno:location() | none,
20            module(),
21            ErrorDescriptor :: term()}
22
23              The  standard  error_info()  structure that is returned from all
24              I/O modules. ErrorDescriptor is formattable by format_error/1.
25

EXPORTS

27       file(FileName) -> yecc_ret()
28
29       file(Grammarfile, Options) -> yecc_ret()
30
31              Types:
32
33                 Grammarfile = file:filename()
34                 Options = Option | [Option]
35                 Option =
36                     {error_location, column | line} |
37                     {includefile, Includefile :: file:filename()} |
38                     {report_errors, boolean()} |
39                     {report_warnings, boolean()} |
40                     {report, boolean()} |
41                     {return_errors, boolean()} |
42                     {return_warnings, boolean()} |
43                     {return, boolean()} |
44                     {parserfile, Parserfile :: file:filename()} |
45                     {verbose, boolean()} |
46                     {warnings_as_errors, boolean()} |
47                     {deterministic, boolean()} |
48                     report_errors | report_warnings | report |  return_errors
49                 |
50                     return_warnings | return | verbose | warnings_as_errors
51                 yecc_ret() = ok_ret() | error_ret()
52                 ok_ret() =
53                     {ok, Parserfile :: file:filename()} |
54                     {ok, Parserfile :: file:filename(), warnings()}
55                 error_ret() =
56                     error | {error, Errors :: errors(), Warnings :: warnings()}
57                 errors() = [{file:filename(), [error_info()]}]
58                 warnings() = [{file:filename(), [error_info()]}]
59
60              Grammarfile  is  the file of declarations and grammar rules. Re‐
61              turns ok upon success, or error if there are errors.  An  Erlang
62              file  containing  the  parser is created if there are no errors.
63              The options are:
64
65                {includefile, Includefile}.:
66                  Indicates a customized prologue file which the user may want
67                  to  use  instead  of  the  default  file  lib/parsetools/in‐
68                  clude/yeccpre.hrl which is otherwise included at the  begin‐
69                  ning  of  the resulting parser file. N.B. The Includefile is
70                  included 'as is' in the parser file, so it must not  have  a
71                  module  declaration  of  its  own, and it should not be com‐
72                  piled. It must, however, contain the necessary export decla‐
73                  rations. The default is indicated by "".
74
75                {parserfile, Parserfile}.:
76                  Parserfile is the name of the file that will contain the Er‐
77                  lang parser code that is generated. The default ("")  is  to
78                  add  the  extension .erl to Grammarfile stripped of the .yrl
79                  extension.
80
81                {report_errors, boolean()}.:
82                  Causes errors to be printed as they occur. Default is true.
83
84                {report_warnings, boolean()}.:
85                  Causes warnings to be printed  as  they  occur.  Default  is
86                  true.
87
88                {report, boolean()}.:
89                  This is a short form for both report_errors and report_warn‐
90                  ings.
91
92                {return_errors, boolean()}.:
93                  If this flag is set, {error, Errors, Warnings}  is  returned
94                  when there are errors. Default is false.
95
96                {return_warnings, boolean()}.:
97                  If  this  flag is set, an extra field containing Warnings is
98                  added to the tuple returned upon success. Default is false.
99
100                {return, boolean()}.:
101                  This is a short form for both return_errors and return_warn‐
102                  ings.
103
104                {verbose, boolean()}. :
105                  Determines whether the parser generator should give full in‐
106                  formation about resolved and unresolved  parse  action  con‐
107                  flicts  (true), or only about those conflicts that prevent a
108                  parser from being generated from the input  grammar  (false,
109                  the default).
110
111                {warnings_as_errors, boolean()}:
112                  Causes warnings to be treated as errors.
113
114                {error_location, column | line}.:
115                  If  the value of this flag is line, the location of warnings
116                  and errors is a line number. If the value is column, the lo‐
117                  cation  includes  a line number and a column number. Default
118                  is column.
119
120                {deterministic, boolean()}:
121                  Causes generated -file()  attributes  to  only  include  the
122                  basename of the file path.
123
124              Any  of  the  Boolean  options can be set to true by stating the
125              name of the option. For example, verbose is equivalent to  {ver‐
126              bose, true}.
127
128              The  value  of the Parserfile option stripped of the .erl exten‐
129              sion is used by Yecc as the module name of the generated  parser
130              file.
131
132              Yecc  will  add  the extension .yrl to the Grammarfile name, the
133              extension .hrl to the Includefile name, and the  extension  .erl
134              to the Parserfile name, unless the extension is already there.
135
136       format_error(ErrorDescriptor) -> io_lib:chars()
137
138              Types:
139
140                 ErrorDescriptor = term()
141
142              Returns  a  descriptive string in English of an error reason Er‐
143              rorDescriptor returned by yecc:file/1,2. This function is mainly
144              used by the compiler invoking Yecc.
145

DEFAULT YECC OPTIONS

147       The  (host  operating system) environment variable ERL_COMPILER_OPTIONS
148       can be used to give default Yecc options. Its value must be a valid Er‐
149       lang  term.  If  the  value is a list, it is used as is. If it is not a
150       list, it is put into a list.
151
152       The list is appended to any options given to file/2.
153
154       The list can be retrieved with  compile:env_compiler_options/0.
155

PRE-PROCESSING

157       A scanner to pre-process the text (program, etc.) to be parsed  is  not
158       provided  in  the  yecc module. The scanner serves as a kind of lexicon
159       look-up routine. It is possible to write a grammar that uses only char‐
160       acter  tokens  as  terminal symbols, thereby eliminating the need for a
161       scanner, but this would make the parser larger and slower.
162
163       The user should implement a scanner that segments the input  text,  and
164       turns it into one or more lists of tokens. Each token should be a tuple
165       containing information about syntactic category, position in  the  text
166       (e.g.  line  number), and the actual terminal symbol found in the text:
167       {Category, Position, Symbol}.
168
169       If a terminal symbol is the only member of a category, and  the  symbol
170       name  is  identical to the category name, the token format may be {Sym‐
171       bol, Position}.
172
173       A list of tokens produced by the scanner  should  end  with  a  special
174       end_of_input  tuple which the parser is looking for. The format of this
175       tuple should be {Endsymbol, EndPosition}, where Endsymbol is an identi‐
176       fier that is distinguished from all the terminal and non-terminal cate‐
177       gories of the syntax rules. The Endsymbol may be declared in the  gram‐
178       mar file (see below).
179
180       The simplest case is to segment the input string into a list of identi‐
181       fiers (atoms) and use those atoms both as categories and values of  the
182       tokens.  For  example,  the  input string aaa bbb 777, X may be scanned
183       (tokenized) as:
184
185       [{aaa, 1}, {bbb, 1}, {777, 1}, {',' , 1}, {'X', 1},
186        {'$end', 1}].
187
188       This assumes that this is the first line of the input  text,  and  that
189       '$end' is the distinguished end_of_input symbol.
190
191       The  Erlang  scanner  in  the io module can be used as a starting point
192       when writing a new scanner. Study yeccscan.erl in order to  see  how  a
193       filter  can  be added on top of io:scan_erl_form/3 to provide a scanner
194       for Yecc that tokenizes grammar files before parsing them with the Yecc
195       parser.  A  more general approach to scanner implementation is to use a
196       scanner generator. A scanner generator in Erlang called leex  is  under
197       development.
198

GRAMMAR DEFINITION FORMAT

200       Erlang  style  comments,  starting  with  a '%', are allowed in grammar
201       files.
202
203       Each declaration or rule ends with a dot (the character '.').
204
205       The grammar starts with an optional header section. The header  is  put
206       first in the generated file, before the module declaration. The purpose
207       of the header is to provide a means to make the documentation generated
208       by  EDoc  look  nicer.  Each  header  line should be enclosed in double
209       quotes, and newlines will be inserted between the lines. For example:
210
211       Header "%% Copyright (C)"
212       "%% @private"
213       "%% @Author John".
214
215       Next comes a declaration of the nonterminal categories to  be  used  in
216       the rules. For example:
217
218       Nonterminals sentence nounphrase verbphrase.
219
220       A  non-terminal  category  can be used at the left hand side (= lhs, or
221       head) of a grammar rule. It can also appear at the right hand  side  of
222       rules.
223
224       Next comes a declaration of the terminal categories, which are the cat‐
225       egories of tokens produced by the scanner. For example:
226
227       Terminals article adjective noun verb.
228
229       Terminal categories may only appear in the right hand sides (= rhs)  of
230       grammar rules.
231
232       Next  comes  a  declaration of the rootsymbol, or start category of the
233       grammar. For example:
234
235       Rootsymbol sentence.
236
237       This symbol should appear in the lhs of at least one grammar rule. This
238       is the most general syntactic category which the parser ultimately will
239       parse every input string into.
240
241       After the rootsymbol declaration comes an optional declaration  of  the
242       end_of_input symbol that your scanner is expected to use. For example:
243
244       Endsymbol '$end'.
245
246       Next comes one or more declarations of operator precedences, if needed.
247       These are used to resolve shift/reduce conflicts (see  yacc  documenta‐
248       tion).
249
250       Examples of operator declarations:
251
252       Right 100 '='.
253       Nonassoc 200 '==' '=/='.
254       Left 300 '+'.
255       Left 400 '*'.
256       Unary 500 '-'.
257
258       These  declarations mean that '=' is defined as a right associative bi‐
259       nary operator with precedence 100, '==' and '=/=' are operators with no
260       associativity, '+' and '*' are left associative binary operators, where
261       '*' takes precedence over '+' (the normal case), and '-' is a unary op‐
262       erator  of  higher precedence than '*'. The fact that '==' has no asso‐
263       ciativity means that an expression like a == b ==  c  is  considered  a
264       syntax error.
265
266       Certain  rules  are  assigned precedence: each rule gets its precedence
267       from the last terminal symbol mentioned in the right hand side  of  the
268       rule. It is also possible to declare precedence for non-terminals, "one
269       level up". This is practical when an operator is overloaded  (see  also
270       example 3 below).
271
272       Next come the grammar rules. Each rule has the general form
273
274       Left_hand_side -> Right_hand_side : Associated_code.
275
276       The left hand side is a non-terminal category. The right hand side is a
277       sequence of one or more non-terminal or terminal  symbols  with  spaces
278       between.  The  associated code is a sequence of zero or more Erlang ex‐
279       pressions (with commas ',' as separators). If the  associated  code  is
280       empty,  the separating colon ':' is also omitted. A final dot marks the
281       end of the rule.
282
283       Symbols such as '{', '.', etc., have to be enclosed  in  single  quotes
284       when used as terminal or non-terminal symbols in grammar rules. The use
285       of the symbols '$empty', '$end', and '$undefined' should be avoided.
286
287       The last part of the grammar file is an optional  section  with  Erlang
288       code  (= function definitions) which is included 'as is' in the result‐
289       ing parser file. This section must start with the  pseudo  declaration,
290       or key words
291
292       Erlang code.
293
294       No  syntax  rule definitions or other declarations may follow this sec‐
295       tion. To avoid conflicts with internal variables, do not  use  variable
296       names  beginning  with  two  underscore characters ('__') in the Erlang
297       code in this section, or in the code  associated  with  the  individual
298       syntax rules.
299
300       The  optional expect declaration can be placed anywhere before the last
301       optional section with Erlang code. It is used for suppressing the warn‐
302       ing  about conflicts that is ordinarily given if the grammar is ambigu‐
303       ous. An example:
304
305       Expect 2.
306
307       The warning is given if the number of  shift/reduce  conflicts  differs
308       from 2, or if there are reduce/reduce conflicts.
309

EXAMPLES

311       A grammar to parse list expressions (with empty associated code):
312
313       Nonterminals list elements element.
314       Terminals atom '(' ')'.
315       Rootsymbol list.
316       list -> '(' ')'.
317       list -> '(' elements ')'.
318       elements -> element.
319       elements -> element elements.
320       element -> atom.
321       element -> list.
322
323       This grammar can be used to generate a parser which parses list expres‐
324       sions, such as (), (a), (peter charles), (a (b c) d (())), ... provided
325       that  your scanner tokenizes, for example, the input (peter charles) as
326       follows:
327
328       [{'(', 1} , {atom, 1, peter}, {atom, 1, charles}, {')', 1},
329        {'$end', 1}]
330
331       When a grammar rule is used by the parser to parse (part of) the  input
332       string  as  a grammatical phrase, the associated code is evaluated, and
333       the value of the last  expression  becomes  the  value  of  the  parsed
334       phrase.  This value may be used by the parser later to build structures
335       that are values of higher phrases of which  the  current  phrase  is  a
336       part.  The  values initially associated with terminal category phrases,
337       i.e. input tokens, are the token tuples themselves.
338
339       Below is an example of the grammar above with structure  building  code
340       added:
341
342       list -> '(' ')' : nil.
343       list -> '(' elements ')' : '$2'.
344       elements -> element : {cons, '$1', nil}.
345       elements -> element elements : {cons, '$1', '$2'}.
346       element -> atom : '$1'.
347       element -> list : '$1'.
348
349       With this code added to the grammar rules, the parser produces the fol‐
350       lowing value (structure) when parsing the input string (a b  c)..  This
351       still  assumes that this was the first input line that the scanner tok‐
352       enized:
353
354       {cons, {atom, 1, a}, {cons, {atom, 1, b},
355                                   {cons, {atom, 1, c}, nil}}}
356
357       The associated code contains pseudo variables '$1',  '$2',  '$3',  etc.
358       which  refer  to (are bound to) the values associated previously by the
359       parser with the symbols of the right hand side of the rule. When  these
360       symbols are terminal categories, the values are token tuples of the in‐
361       put string (see above).
362
363       The associated code may not only be used to build structures associated
364       with  phrases,  but  may also be used for syntactic and semantic tests,
365       printout actions (for example for tracing),  etc.  during  the  parsing
366       process.  Since tokens contain positional (line number) information, it
367       is possible to produce error messages which contain  line  numbers.  If
368       there  is no associated code after the right hand side of the rule, the
369       value '$undefined' is associated with the phrase.
370
371       The right hand side of a grammar rule may be empty. This  is  indicated
372       by  using  the  special  symbol  '$empty' as rhs. Then the list grammar
373       above may be simplified to:
374
375       list -> '(' elements ')' : '$2'.
376       elements -> element elements : {cons, '$1', '$2'}.
377       elements -> '$empty' : nil.
378       element -> atom : '$1'.
379       element -> list : '$1'.
380

GENERATING A PARSER

382       To call the parser generator, use the following command:
383
384       yecc:file(Grammarfile).
385
386       An error message from Yecc will be shown if the grammar is not  of  the
387       LALR  type  (for example too ambiguous). Shift/reduce conflicts are re‐
388       solved in favor of shifting if there are no operator precedence  decla‐
389       rations.  Refer to the yacc documentation on the use of operator prece‐
390       dence.
391
392       The output file contains Erlang source code for a  parser  module  with
393       module  name  equal to the Parserfile parameter. After compilation, the
394       parser can be called as follows (the module name is assumed to  be  my‐
395       parser):
396
397       myparser:parse(myscanner:scan(Inport))
398
399       The call format may be different if a customized prologue file has been
400       included when  generating  the  parser  instead  of  the  default  file
401       lib/parsetools/include/yeccpre.hrl.
402
403       With  the standard prologue, this call will return either {ok, Result},
404       where Result is a structure that the Erlang code of  the  grammar  file
405       has  built, or {error, {Position, Module, Message}} if there was a syn‐
406       tax error in the input.
407
408       Message is something which may be converted into a  string  by  calling
409       Module:format_error(Message) and printed with io:format/3.
410
411   Note:
412       By default, the parser that was generated will not print out error mes‐
413       sages to the screen. The user will have to do this either  by  printing
414       the  returned  error messages, or by inserting tests and print instruc‐
415       tions in the Erlang code associated with the syntax rules of the  gram‐
416       mar file.
417
418
419       It  is  also possible to make the parser ask for more input tokens when
420       needed if the following call format is used:
421
422       myparser:parse_and_scan({Function, Args})
423       myparser:parse_and_scan({Mod, Tokenizer, Args})
424
425       The tokenizer Function is either a fun or a tuple {Mod, Tokenizer}. The
426       call apply(Function, Args) or apply({Mod, Tokenizer}, Args) is executed
427       whenever a new token is needed. This, for example, makes it possible to
428       parse from a file, token by token.
429
430       The  tokenizer  used above has to be implemented so as to return one of
431       the following:
432
433       {ok, Tokens, EndPosition}
434       {eof, EndPosition}
435       {error, Error_description, EndPosition}
436
437       This conforms to the format used by the scanner in the  Erlang  io  li‐
438       brary module.
439
440       If   {eof,   EndPosition}   is   returned   immediately,  the  call  to
441       parse_and_scan/1 returns {ok, eof}. If {eof, EndPosition}  is  returned
442       before  the  parser  expects  end  of  input, parse_and_scan/1 will, of
443       course, return an error message (see above). Otherwise {ok, Result}  is
444       returned.
445

MORE EXAMPLES

447       1. A grammar for parsing infix arithmetic expressions into prefix nota‐
448       tion, without operator precedence:
449
450       Nonterminals E T F.
451       Terminals '+' '*' '(' ')' number.
452       Rootsymbol E.
453       E -> E '+' T: {'$2', '$1', '$3'}.
454       E -> T : '$1'.
455       T -> T '*' F: {'$2', '$1', '$3'}.
456       T -> F : '$1'.
457       F -> '(' E ')' : '$2'.
458       F -> number : '$1'.
459
460       2. The same with operator precedence becomes simpler:
461
462       Nonterminals E.
463       Terminals '+' '*' '(' ')' number.
464       Rootsymbol E.
465       Left 100 '+'.
466       Left 200 '*'.
467       E -> E '+' E : {'$2', '$1', '$3'}.
468       E -> E '*' E : {'$2', '$1', '$3'}.
469       E -> '(' E ')' : '$2'.
470       E -> number : '$1'.
471
472       3. An overloaded minus operator:
473
474       Nonterminals E uminus.
475       Terminals '*' '-' number.
476       Rootsymbol E.
477
478       Left 100 '-'.
479       Left 200 '*'.
480       Unary 300 uminus.
481
482       E -> E '-' E.
483       E -> E '*' E.
484       E -> uminus.
485       E -> number.
486
487       uminus -> '-' E.
488
489       4. The Yecc grammar that is used for parsing grammar  files,  including
490       itself:
491
492       Nonterminals
493       grammar declaration rule head symbol symbols attached_code
494       token tokens.
495       Terminals
496       atom float integer reserved_symbol reserved_word string char var
497       Rootsymbol grammar.
498       Endsymbol '$end'.
499       grammar -> declaration : '$1'.
500       grammar -> rule : '$1'.
501       declaration -> symbol symbols dot: {'$1', '$2'}.
502       rule -> head '->' symbols attached_code dot: {rule, ['$1' | '$3'],
503               '$4'}.
504       head -> symbol : '$1'.
505       symbols -> symbol : ['$1'].
506       symbols -> symbol symbols : ['$1' | '$2'].
507       attached_code -> ':' tokens : {erlang_code, '$2'}.
508       attached_code -> '$empty' : {erlang_code,
509                        [{atom, 0, '$undefined'}]}.
510       tokens -> token : ['$1'].
511       tokens -> token tokens : ['$1' | '$2'].
512       symbol -> var : value_of('$1').
513       symbol -> atom : value_of('$1').
514       symbol -> integer : value_of('$1').
515       symbol -> reserved_word : value_of('$1').
516       token -> var : '$1'.
517       token -> atom : '$1'.
518       token -> float : '$1'.
519       token -> integer : '$1'.
520       token -> string : '$1'.
521       token -> char : '$1'.
522       token -> reserved_symbol : {value_of('$1'), line_of('$1')}.
523       token -> reserved_word : {value_of('$1'), line_of('$1')}.
524       token -> '->' : {'->', line_of('$1')}.
525       token -> ':' : {':', line_of('$1')}.
526       Erlang code.
527       value_of(Token) ->
528           element(3, Token).
529       line_of(Token) ->
530           element(2, Token).
531
532   Note:
533       The  symbols '->', and ':' have to be treated in a special way, as they
534       are meta symbols of the grammar notation, as well as  terminal  symbols
535       of the Yecc grammar.
536
537
538       5.  The file erl_parse.yrl in the lib/stdlib/src directory contains the
539       grammar for Erlang.
540
541   Note:
542       Syntactic tests are used in the code associated with some rules, and an
543       error is thrown (and caught by the generated parser to produce an error
544       message) when a test fails. The same effect can be achieved with a call
545       to return_error(ErrorPosition, Message_string), which is defined in the
546       yeccpre.hrl default header file.
547
548

FILES

550       lib/parsetools/include/yeccpre.hrl
551