1leex(3)                    Erlang Module Definition                    leex(3)
2
3
4

NAME

6       leex - Lexical analyzer generator for Erlang
7

DESCRIPTION

9       A regular expression based lexical analyzer generator for Erlang, simi‐
10       lar to lex or flex.
11
12   Note:
13       The Leex module should be considered experimental as it will be subject
14       to changes in future releases.
15
16

DATA TYPES

18       error_info() =
19           {erl_anno:line() | none, module(), ErrorDescriptor :: term()}
20
21              The  standard  error_info()  structure that is returned from all
22              I/O modules. ErrorDescriptor is formattable by format_error/1.
23

EXPORTS

25       file(FileName) -> leex_ret()
26
27       file(FileName, Options) -> leex_ret()
28
29              Types:
30
31                 FileName = file:filename()
32                 Options = Option | [Option]
33                 Option =
34                     {dfa_graph, boolean()} |
35                     {includefile, Includefile :: file:filename()} |
36                     {report_errors, boolean()} |
37                     {report_warnings, boolean()} |
38                     {report, boolean()} |
39                     {return_errors, boolean()} |
40                     {return_warnings, boolean()} |
41                     {return, boolean()} |
42                     {scannerfile, Scannerfile :: file:filename()} |
43                     {verbose, boolean()} |
44                     {warnings_as_errors, boolean()} |
45                     {deterministic, boolean()} |
46                     {error_location, line | column} |
47                     {tab_size, integer() >= 1} |
48                     dfa_graph | report_errors | report_warnings | report |
49                     return_errors | return_warnings | return | verbose |
50                     warnings_as_errors
51                 leex_ret() = ok_ret() | error_ret()
52                 ok_ret() =
53                     {ok, Scannerfile :: file:filename()} |
54                     {ok, Scannerfile :: file:filename(), warnings()}
55                 error_ret() =
56                     error | {error, Errors :: errors(), Warnings :: warnings()}
57                 errors() = [{file:filename(), [error_info()]}]
58                 warnings() = [{file:filename(), [error_info()]}]
59
60              Generates a lexical analyzer from the definition  in  the  input
61              file.  The  input  file has the extension .xrl. This is added to
62              the filename if it is not given. The resulting module is the Xrl
63              filename without the .xrl extension.
64
65              The current options are:
66
67                dfa_graph:
68                  Generates  a  .dot  file which contains a description of the
69                  DFA  in  a  format  which  can  be  viewed  with   Graphviz,
70                  www.graphviz.org.
71
72                {includefile,Includefile}:
73                  Uses  a  specific or customised prologue file instead of de‐
74                  fault lib/parsetools/include/leexinc.hrl which is  otherwise
75                  included.
76
77                {report_errors, boolean()}:
78                  Causes errors to be printed as they occur. Default is true.
79
80                {report_warnings, boolean()}:
81                  Causes  warnings  to  be  printed  as they occur. Default is
82                  true.
83
84                {report, boolean()}:
85                  This is a short form for both report_errors and report_warn‐
86                  ings.
87
88                {return_errors, boolean()}:
89                  If  this  flag is set, {error, Errors, Warnings} is returned
90                  when there are errors. Default is false.
91
92                {return_warnings, boolean()}:
93                  If this flag is set, an extra field containing  Warnings  is
94                  added to the tuple returned upon success. Default is false.
95
96                {return, boolean()}:
97                  This is a short form for both return_errors and return_warn‐
98                  ings.
99
100                {scannerfile, Scannerfile}:
101                  Scannerfile is the name of the file that  will  contain  the
102                  Erlang  scanner  code that is generated. The default ("") is
103                  to add the extension .erl to FileName stripped of  the  .xrl
104                  extension.
105
106                {verbose, boolean()}:
107                  Outputs information from parsing the input file and generat‐
108                  ing the internal tables.
109
110                {warnings_as_errors, boolean()}:
111                  Causes warnings to be treated as errors.
112
113                {deterministic, boolean()}:
114                  Causes generated -file()  attributes  to  only  include  the
115                  basename of the file path.
116
117                {error_location, line | column}:
118                  If set to column, the error location will be a {Line,Column}
119                  tuple instead of just Line. Also, StartLoc and EndLoc in the
120                  string/2, token/3, and tokens/3 functions will be {Line,Column}
121                  tuples instead of just Line. Default is line. Note that you can use
122                  TokenLoc  for  token location independently, even if the er‐
123                  ror_location is set to line.
124
125                  Each Unicode character is counted as as many columns as the
126                  number of bytes used to represent it.
127
128                {tab_size, pos_integer()}:
129                  Sets the width of \t character (only relevant if error_loca‐
130                  tion is set to column). Default is 8.
131
132              Any of the Boolean options can be set to  true  by  stating  the
133              name  of the option. For example, verbose is equivalent to {ver‐
134              bose, true}.
135
136              Leex will add the extension .hrl to the Includefile name and the
137              extension  .erl to the Scannerfile name, unless the extension is
138              already there.
139
140       format_error(ErrorDescriptor) -> io_lib:chars()
141
142              Types:
143
144                 ErrorDescriptor = term()
145
146              Returns a descriptive string in English of an error  reason  Er‐
147              rorDescriptor  returned  by leex:file/1,2 when there is an error
148              in a regular expression.
149

GENERATED SCANNER EXPORTS

151       The following functions are exported by the generated scanner.
152

EXPORTS

154       Module:string(String) -> StringRet
155       Module:string(String, StartLoc) -> StringRet
156
157              Types:
158
159                 String = string()
160                 StringRet = {ok,Tokens,EndLoc} | ErrorInfo
161                 Tokens = [Token]
162                 StartLoc = EndLoc = erl_anno:location()
163
164              Scans String and returns all the tokens  in  it,  or  an  error.
165              StartLoc and EndLoc are either erl_anno:line() or erl_anno:loca‐
166              tion(), depending on the error_location option.
167
168          Note:
169              It is an error if not all of the characters in String  are  con‐
170              sumed.
171
172
173       Module:token(Cont, Chars) -> {more,Cont1} | {done,TokenRet,RestChars}
174       Module:token(Cont,  Chars,  StartLoc)  ->  {more,Cont1}  | {done,Token‐
175       Ret,RestChars}
176
177              Types:
178
179                 Cont = [] | Cont1
180                 Cont1 = tuple()
181                 Chars = RestChars = string() | eof
182                 TokenRet = {ok, Token, EndLoc} | {eof, EndLoc} | ErrorInfo
183                 StartLoc = EndLoc = erl_anno:location()
184
185              This is a re-entrant call to try and scan one token from  Chars.
186              If  there  are enough characters in Chars to either scan a token
187              or detect an error then this will be returned  with  {done,...}.
188              Otherwise {more,Cont1} will be returned, where Cont1 is used  as
189              Cont in the next call to token() with more characters to try and
190              scan the token. This is continued until a token has been scanned.
191              Cont is initially [].
192
193              It is not designed to be called directly by an  application  but
194              used  through the i/o system where it can typically be called in
195              an application by:
196
197              io:request(InFile, {get_until,unicode,Prompt,Module,token,[Loc]})
198                -> TokenRet
199
200       Module:tokens(Cont, Chars) -> {more,Cont1} | {done,TokensRet,RestChars}
201       Module:tokens(Cont, Chars, StartLoc)  ->  {more,Cont1}  |  {done,Token‐
202       sRet,RestChars}
203
204              Types:
205
206                 Cont = [] | Cont1
207                 Cont1 = tuple()
208                 Chars = RestChars = string() | eof
209                 TokensRet = {ok, Tokens, EndLoc} | {eof, EndLoc} | ErrorInfo
210                 Tokens = [Token]
211                 StartLoc = EndLoc = erl_anno:location()
212
213              This  is a re-entrant call to try and scan tokens from Chars. If
214              there are enough characters in Chars to either  scan  tokens  or
215              detect an error then this will be returned with {done,...}. Oth‐
216              erwise {more,Cont1} will be returned, where Cont1 is used as Cont
217              in the next call to tokens() with more characters to try and scan
218              the tokens. This is continued until all tokens have been scanned.
219              Cont is initially [].
220
221              This function differs from token in that it  will  continue  to
222              scan tokens up to and including the token for which a rule returns
223              {end_token,Token} (see INPUT FILE FORMAT below). It will then return all the tokens.
224              This is typically used for scanning grammars like  Erlang  where
225              there  is  an  explicit end token, '.'. If no end token is found
226              then the whole file will be scanned and returned.  If  an  error
227              occurs  then  all  tokens up to and including the next end token
228              will be skipped.
229
230              It is not designed to be called directly by an  application  but
231              used  through the i/o system where it can typically be called in
232              an application by:
233
234              io:request(InFile, {get_until,unicode,Prompt,Module,tokens,[Loc]})
235                -> TokensRet
236

DEFAULT LEEX OPTIONS

238       The (host operating system) environment  variable  ERL_COMPILER_OPTIONS
239       can be used to give default Leex options. Its value must be a valid Er‐
240       lang term. If the value is a list, it is used as is. If  it  is  not  a
241       list, it is put into a list.
242
243       The list is appended to any options given to file/2.
244
245       The list can be retrieved with  compile:env_compiler_options/0.
246

INPUT FILE FORMAT

248       Erlang style comments starting with a % are allowed in scanner files. A
249       definition file has the following format:
250
251       <Header>
252
253       Definitions.
254
255       <Macro Definitions>
256
257       Rules.
258
259       <Token Rules>
260
261       Erlang code.
262
263       <Erlang code>
264
265       The "Definitions.", "Rules." and "Erlang code." headings are  mandatory
266       and  must occur at the beginning of a source line. The <Header>, <Macro
267       Definitions> and <Erlang code> sections may be empty but there must  be
268       at least one rule.
269
270       Macro definitions have the following format:
271
272       NAME = VALUE
273
274       and  there  must  be spaces around =. Macros can be used in the regular
275       expressions of rules by writing {NAME}.
276
277   Note:
278       When macros are expanded in expressions the macro calls are replaced by
279       the  macro  value without any form of quoting or enclosing in parenthe‐
280       ses.
281
282
283       Rules have the following format:
284
285       <Regexp> : <Erlang code>.
286
287       The <Regexp> must occur at the start of a  line  and  not  include  any
288       blanks;  use \t and \s to include TAB and SPACE characters in the regu‐
289       lar expression. If <Regexp>  matches  then  the  corresponding  <Erlang
290       code> is evaluated to generate a token. Within the Erlang code the fol‐
291       lowing predefined variables are available:
292
293         TokenChars:
294           A list of the characters in the matched token.
295
296         TokenLen:
297           The number of characters in the matched token.
298
299         TokenLine:
300           The line number where the token occurred.
301
302         TokenCol:
303           The column number where the token occurred  (column  of  the  first
304           character included in the token).
305
306         TokenLoc:
307           Token  location.  Expands  to  {TokenLine,TokenCol}  (even when er‐
308           ror_location is set to line).
309
310       The code must return:
311
312         {token,Token}:
313           Return Token to the caller.
314
315         {end_token,Token}:
316           Return Token to the caller; it is the last token in a tokens call.
317
318         skip_token:
319           Skip this token completely.
320
321         {error,ErrString}:
322           An error in the token, ErrString is a string describing the error.
323
324       It is also possible to push back characters into the  input  characters
325       with the following returns:
326
327         * {token,Token,PushBackList}
328
329         * {end_token,Token,PushBackList}
330
331         * {skip_token,PushBackList}
332
333       These  have  the same meanings as the normal returns but the characters
334       in PushBackList will be prepended to the input characters  and  scanned
335       for the next token. Note that pushing back a newline will mean the line
336       numbering will no longer be correct.
337
338   Note:
339       Pushing back characters gives you unexpected possibilities to cause the
340       scanner to loop!
341
342
343       The  following example would match a simple Erlang integer or float and
344       return a token which could be sent to the Erlang parser:
345
346       D = [0-9]
347
348       {D}+ :
349         {token,{integer,TokenLine,list_to_integer(TokenChars)}}.
350
351       {D}+\.{D}+((E|e)(\+|\-)?{D}+)? :
352         {token,{float,TokenLine,list_to_float(TokenChars)}}.
353
354       The Erlang code in the "Erlang code." section is written into the  out‐
355       put  file  directly after the module declaration and predefined exports
356       declaration so it is possible to add extra exports, define imports  and
357       other attributes which are then visible in the whole file.
358

REGULAR EXPRESSIONS

360       The regular expressions allowed here are a subset of the set found in
361       egrep and in the AWK programming language, as defined in the book,  The
362       AWK  Programming  Language,  by A. V. Aho, B. W. Kernighan, P. J. Wein‐
363       berger. They are composed of the following characters:
364
365         c:
366           Matches the non-metacharacter c.
367
368         \c:
369           Matches the escape sequence or literal character c.
370
371         .:
372           Matches any character.
373
374         ^:
375           Matches the beginning of a string.
376
377         $:
378           Matches the end of a string.
379
380         [abc...]:
381           Character class, which matches any of the characters abc....  Char‐
382           acter  ranges  are specified by a pair of characters separated by a
383           -.
384
385         [^abc...]:
386           Negated character class, which matches any character except abc....
387
388         r1 | r2:
389           Alternation. It matches either r1 or r2.
390
391         r1r2:
392           Concatenation. It matches r1 and then r2.
393
394         r+:
395           Matches one or more rs.
396
397         r*:
398           Matches zero or more rs.
399
400         r?:
401           Matches zero or one rs.
402
403         (r):
404           Grouping. It matches r.
405
406       The escape sequences allowed are the same as for Erlang strings:
407
408         \b:
409           Backspace.
410
411         \f:
412           Form feed.
413
414         \n:
415           Newline (line feed).
416
417         \r:
418           Carriage return.
419
420         \t:
421           Tab.
422
423         \e:
424           Escape.
425
426         \v:
427           Vertical tab.
428
429         \s:
430           Space.
431
432         \d:
433           Delete.
434
435         \ddd:
436           The octal value ddd.
437
438         \xhh:
439           The hexadecimal value hh.
440
441         \x{h...}:
442           The hexadecimal value h....
443
444         \c:
445           Any other character literally, for example \\ for backslash, \" for
446           ".
447
448       The  following examples define simplified versions of a few Erlang data
449       types:
450
451       Atoms [a-z][0-9a-zA-Z_]*
452
453       Variables [A-Z_][0-9a-zA-Z_]*
454
455       Floats (\+|-)?[0-9]+\.[0-9]+((E|e)(\+|-)?[0-9]+)?
456
457   Note:
458       Anchoring a regular expression with ^ and $ is not implemented  in  the
459       current version of Leex and just generates a parse error.
460
461
462
463Ericsson AB                     parsetools 2.5                         leex(3)
Impressum