1BTPARSE(1)                          btparse                         BTPARSE(1)
2
3
4

NAME

6       btparse - C library for parsing and processing BibTeX data files
7

SYNOPSIS

9          #include <btparse.h>
10
11          /* Basic library initialization / cleanup */
12          void bt_initialize (void);
13          void bt_free_ast (AST *ast);
14          void bt_cleanup (void);
15
16          /* Input / interface to parser */
17          void   bt_set_stringopts (bt_metatype_t metatype, btshort options);
18          AST * bt_parse_entry_s (char *    entry_text,
19                                  char *    filename,
20                                  int       line,
21                                  btshort    options,
22                                  boolean * status);
23          AST * bt_parse_entry   (FILE *    infile,
24                                  char *    filename,
25                                  btshort    options,
26                                  boolean * status);
27          AST * bt_parse_file    (char *    filename,
28                                  btshort    options,
29                                  boolean * overall_status);
30
31          /* AST traversal/query */
32          AST * bt_next_entry (AST * entry_list,
33                               AST * prev_entry);
34          AST * bt_next_field (AST *entry, AST *prev, char **name);
35          AST * bt_next_value (AST *head,
36                               AST *prev,
37                               bt_nodetype_t *nodetype,
38                               char **text);
39
40          bt_metatype_t bt_entry_metatype (AST *entry);
41          char *bt_entry_type (AST *entry);
42          char *bt_entry_key (AST *entry);
43          char *bt_get_text (AST *node);
44
45          /* Splitting names and lists of names */
46          bt_stringlist * bt_split_list (char *   string,
47                                         char *   delim,
48                                         char *   filename,
49                                         int      line,
50                                         char *   description);
51          void bt_free_list (bt_stringlist *list);
52          bt_name * bt_split_name (char *  name,
53                                   char *  filename,
54                                   int     line,
55                                   int     name_num);
56          void bt_free_name (bt_name * name);
57
58          /* Formatting names */
59          bt_name_format * bt_create_name_format (char * parts, boolean abbrev_first);
60          void bt_free_name_format (bt_name_format * format);
61          void bt_set_format_text (bt_name_format * format,
62                                   bt_namepart part,
63                                   char * pre_part,
64                                   char * post_part,
65                                   char * pre_token,
66                                   char * post_token);
67          void bt_set_format_options (bt_name_format * format,
68                                      bt_namepart part,
69                                      boolean abbrev,
70                                      bt_joinmethod join_tokens,
71                                      bt_joinmethod join_part);
72          char * bt_format_name (bt_name * name, bt_name_format * format);
73
74          /* Construct tree from TeX groups */
75          bt_tex_tree * bt_build_tex_tree (char * string);
76          void          bt_free_tex_tree (bt_tex_tree **top);
77          void          bt_dump_tex_tree (bt_tex_tree *node, int depth, FILE *stream);
78          char *        bt_flatten_tex_tree (bt_tex_tree *top);
79
80          /* Miscellaneous string utilities */
81          void bt_purify_string (char * string, btshort options);
82          void bt_change_case (char transform, char * string, btshort options);
83

DESCRIPTION

85       btparse is a C library for parsing and processing BibTeX files.  It
86       provides a lexical scanner and LR parser (constructed by PCCTS), both
87       of which are efficient and offer good error detection and recovery; a
88       set of functions for traversing the AST (abstract syntax tree)
89       generated by the parser; and utility functions for manipulating strings
90       according to BibTeX conventions.  (Note that nothing in the library
91       assumes that you're using BibTeX files for their original purpose of
92       bibliographic data for scholarly publications; you could use the file
93       format for any conceivable purpose that fits it.  However, there is
94       some code in the library that is really only appropriate for use with
95       strings meant to be processed in the same way that BibTeX itself does.
96       This is all entirely optional, though.)
97
98       Note that the interface provided by btparse, while complete, is fairly
99       low-level.  If you have more sophisticated needs, you might be
100       interested in my "Text::BibTeX" module for Perl 5 (available on CPAN).
101

CONCEPTS AND TERMINOLOGY

103       To understand this document and use btparse, you should already be
104       familiar with the BibTeX language---more specifically, the BibTeX data
105       description language.  (BibTeX being the complex beast that it is, one
106       can conceive of the term applying to the program, the data language,
107       the particular database structure described in the original BibTeX
108       documentation, the ".bst" formatting language, and the set of
109       conventions embodied in the standard styles included with the BibTeX
110       distribution.  In this document, I'll stick to the first two
111       meanings---the data language because that's what btparse deals with,
112       and the program because it's occasionally necessary to explain
113       differences between my parser and BibTeX's.)
114
115       In particular, you should have a good idea what's going on in the
116       following:
117
118          @string{and = { and },
119                  joe = "Blow, Joe",
120                  john = "John Smith"}
121
122          @book(ourbook,
123                author = joe # and # john,
124                title = {Our Little Book})
125
126       If this looks like something you want to parse, but don't want to have
127       to write your own parser for, you've come to the right place.
128
129       Before going much further, though, you're going to have to learn some
130       of the terminology I use for describing BibTeX data.  Most of it's the
131       same as you'll find in any BibTeX documentation, but it's important to
132       be sure that we're talking about the same things here.  So, some
133       definitions:
134
135       top-level
136           All text in a BibTeX file from the start of the file to the start
137           of the first entry, and between entries thereafter.
138
139       name
140           A string of letters, digits, and the following characters:
141
142              ! $ & * + - . / : ; < > ? [ ] ^ _ ` |
143
144           A "name" is a catch-all used for entry types, entry keys, and field
145           and macro names.  For BibTeX compatibility, there are slightly
146           different rules for these four entities; currently, the only such
147           rule actually implemented is that field and macro names may not
148           begin with a digit.  Some names in the above example: "string",
149           "and".
150
151       entry
152           A chunk of text starting with an "at" sign ("@") at top-level,
153           followed by a name (the entry type), an entry delimiter ("{" or
154           "("), and proceeding to the matching closing delimiter.  Also, the
155           data structure that results from parsing this chunk of text.  There
156           are two entries in the above example.
157
158       entry type
159           The name that comes right after an "@" at top-level.  Examples from
160           above: "string", "book".
161
162       entry metatype
163           A classification of entry types that allows us to group one or more
164           entry types under the same heading.  With the standard BibTeX
165           database structure, "article", "book", "inbook", etc. all fall
166           under the "regular entry" metatype.  Other metatypes are "macro
167           definition" (for "string" entries), "preamble" (for "preamble"
168           entries), and "comment" (for "comment" entries).  In fact, any entry
169           whose type is not one of "string", "preamble", or "comment" is
170           called a "regular" entry.
171
172       entry delimiters
173           "{" and "}", or "(" and ")": the pair of characters that (almost)
174           mark the boundaries of an entry.  "Almost" because the start of an
175           entry is marked by an "@", not by the "entry open" delimiter.
176
177       entry key
178           (Or just key when it's clear what we're speaking of.)  The name
179           immediately following the entry open delimiter in a regular entry,
180           which uniquely identifies the entry.  Example from above:
181           "ourbook".  Only regular entries have keys.
182
183       field
184           A name to the left of an equals sign in a regular or macro-
185           definition entry.  In the latter context, might also be called a
186           macro name.  Examples from above: "joe", "author".
187
188       field list
189           In a regular entry, everything between the entry delimiters except
190           for the entry key.  In a macro definition entry, everything between
191           the entry delimiters (possibly also called a macro list).
192
193       compound value
194           (Usually just "value".)  The text that follows an equals sign ("=")
195           in a regular or macro definition entry, up to a comma or the entry
196           close delimiter; a list of one or more simple values joined by hash
197           signs ("#").
198
199       simple value
200           A string, macro, or number.
201
202       string
203           (Or, sometimes, "quoted string.")  A chunk of text between quotes
204           (""") or braces ("{" and "}").  Braces must balance: "{this is a
205           {string}" is not a BibTeX string, but "{this is a {string}}" is.
206           ("this is a {string" is also illegal, mainly to avoid the
207           possibility of generating bogus TeX code--which BibTeX will do in
208           certain cases.)
209
210       macro
211           A name that appears on the right-hand side of an equals sign (i.e.
212           as one simple value in a compound value).  Implies that this name
213           was defined as a macro in an earlier macro definition entry, but
214           this is only checked if btparse is being asked to expand macros to
215           their full definitions.
216
217       number
218           An unquoted string of digits.
219
220       Working with btparse generally consists of passing the library some
221       BibTeX data (or a source for some BibTeX data, such as a filename or a
222       file pointer), which it then lexically scans and parses to construct an
223       abstract syntax tree (AST).  It returns this AST to you, and you
224       call other btparse functions to traverse and query the tree.
225
226       The contents of AST nodes are the private domain of the library, and
227       you shouldn't go poking into them.  This being C, though, there's
228       nothing to prevent you from doing so except good manners and the
229       possibility that I might change the AST structure in future releases,
230       breaking any badly-behaved code.  Also, it's not necessary to know the
231       structural relationships between nodes in the AST---that's taken care
232       of by the query/traversal functions.
233
234       However, it's useful to know some of the things that btparse deposits
235       in the AST and returns to you through those query/traversal functions.
236       First off, each node has a "node type," which records the syntactic
237       element corresponding to each node.  For instance, the entry
238
239          @book{mybook, author = "Joe Blow", title = "My Little Book"}
240
241       is rooted by an "entry" node; under this would be found a "key" node
242       (for the entry key), two "field" nodes (for the "author" and "title"
243       fields); and associated with each field node would be a "string" node.
244       The only time this concerns you is when you ask the library for a
245       simple value; just looking at the text is not enough to distinguish
246       quoted strings, numbers, and macro names, so btparse returns the
247       nodetype as well.
248
249       In addition to the nodetype, btparse records the metatype of each
250       "entry" node.  This allows you (and the library) to distinguish, say,
251       regular entries from comment entries.  Not only do they have very
252       different structures and must therefore be traversed differently by the
253       library, but certain traversal functions make no sense on certain entry
254       metatypes---thus it's necessary for you to be able to make the
255       distinction as well.
256
257       That said, everything you need to know to work with the AST is
258       explained in bt_traversal.
259

DATA TYPES AND MACROS

261       btparse defines several types required for the external interface.
262       First, it trivially defines a "boolean" type (along with "TRUE" and
263       "FALSE" macros).  This might affect you when including the btparse.h
264       header in your own code---since it's not possible for the code to
265       detect if there is already a "boolean" type defined, you might have to
266       define the "HAVE_BOOLEAN" pre-processor token to deactivate btparse.h's
267       "typedef" of "boolean".
268
269       Next, two enumeration types are defined: "bt_metatype" and
270       "bt_nodetype".  Both of these are used extensively in the library
271       itself, and are made available to users of the library because they can
272       be found in nodes of the "btparse" AST (abstract syntax tree).  (I.e.,
273       querying the AST can give you "bt_metatype" and "bt_nodetype" values,
274       so the "typedef"s must be available to your code.)
275
276   Entry metatype enum
277       "bt_metatype_t" has the following values:
278
279       ·   "BTE_UNKNOWN"
280
281       ·   "BTE_REGULAR"
282
283       ·   "BTE_COMMENT"
284
285       ·   "BTE_PREAMBLE"
286
287       ·   "BTE_MACRODEF"
288
289       which are determined by the "entry type" token.  (@string entries have
290       the "BTE_MACRODEF" metatype; @comment and @preamble correspond to
291       "BTE_COMMENT" and "BTE_PREAMBLE"; and any other entry type has the
292       "BTE_REGULAR" metatype.)
293
294   AST nodetype enum
295       "bt_nodetype" has the following values:
296
297       ·   "BTAST_UNKNOWN"
298
299       ·   "BTAST_ENTRY"
300
301       ·   "BTAST_KEY"
302
303       ·   "BTAST_FIELD"
304
305       ·   "BTAST_STRING"
306
307       ·   "BTAST_NUMBER"
308
309       ·   "BTAST_MACRO"
310
311       Of these, you'll only ever deal with the last three.  They are returned
312       when you query the AST for a simple value---just seeing the text isn't
313       enough to distinguish between a quoted string, a number, and a macro,
314       so the AST nodetype is supplied along with the text.
315
316   String processing option macros
317       Since BibTeX is essentially a system for glueing strings together in a
318       wide variety of ways, the processing done to its strings is fairly
319       important.  Most of the string transformations are done outside of the
320       lexer/parser; this reduces their complexity, and makes it easier to
321       switch different transformations on and off.  This switching is done
322       with an "options" bitmap which can be specified on a per-entry-metatype
323       basis.  (That is, you can have one set of transformations done to the
324       strings in all regular entries, another set done to the strings in all
325       macro definition entries, and so on.)  If you need finer control than
326       that, it's currently unavailable outside of the library (but it's just
327       a matter of making a couple of functions available and documenting
328       them---so bug me if you need this feature).
329
330       There are four basic macros for constructing this bitmap:
331
332       "BTO_CONVERT"
333           Convert "number" values to strings.  (The conversion is trivial,
334           involving changing the type of the AST node representing the number
335           from "BTAST_NUMBER" to "BTAST_STRING".  "Number" values are stored
336           as strings of digits, just as they are in the input data.)
337
338       "BTO_EXPAND"
339           Expand macro invocations to the full macro text.
340
341       "BTO_PASTE"
342           Paste simple values together.
343
344       "BTO_COLLAPSE"
345           Collapse whitespace according to the BibTeX rules.
346
347       For instance, supplying "BTO_CONVERT | BTO_EXPAND" as the string
348       options bitmap for the "BTE_REGULAR" metatype means that all simple
349       values in "regular" entries will be converted to strings: numbers will
350       simply have their "nodetype" changed, and macros will be expanded.
351       Nothing else will be done to the simple values, though---they will not
352       be concatenated, nor will whitespace be collapsed.  See the
353       "bt_set_stringopts()" and "bt_parse_*()" functions in bt_input for more
354       information on the various options for parsing; see bt_postprocess for
355       details on the post-processing.
356

USING THE LIBRARY

358       The following code is a skeletal example of using the btparse library:
359
360           #include <btparse.h>
361
362           int main (void)
363           {
364              bt_initialize ();
365
366              /* process some data */
367
368              bt_cleanup ();
369              return 0;
370           }
371
372       Please note the call to "bt_initialize()"; this is very important!
373       Without it, the library may crash or fail mysteriously.  You must call
374       "bt_initialize()" before calling any other btparse functions.
375       "bt_cleanup()" just frees the memory allocated by "bt_initialize()"; if
376       you are careful to call it before exiting, and "bt_free_ast()" on any
377       abstract syntax trees generated by btparse when you are done with them,
378       then your program shouldn't have any memory leaks.  (Unless they're due
379       to your own code, of course!)
380

BUGS AND LIMITATIONS

382       btparse has several inherent limitations that are due to the lexical
383       scanner and parser generated by PCCTS 1.x.  In short, the scanner and
384       parser are both heavily dependent on global variables, meaning that
385       thread safety -- or even the ability to have two files open and being
386       parsed at the same time -- is well-nigh impossible.  This will not
387       change until I get with the times and adopt ANTLR 2.0, the successor to
388       PCCTS -- presuming of course that it can generate more modular C
389       scanners and parsers.
390
391       Another limitation that is due to PCCTS: entries with a large number of
392       fields (more than about 90, if each field value is just a single
393       string) will cause the parser to crash.  This is unavoidable due to the
394       parser using statically-allocated stacks for attributes and abstract-
395       syntax tree nodes.  I could increase the static allocation, but that
396       would just decrease the likelihood of encountering the problem, not
397       make it go away.  Again, the chances of this changing as long as I'm
398       using PCCTS 1.x are nil.
399
400       Apart from those inherent limitations, there are no known bugs in
401       btparse.  Any segmentation faults or bus errors from the library should
402       be considered bugs.  They probably result from using the library
403       incorrectly (e.g., attempting to interleave the parsing of two files),
404       but I do make an attempt to catch all such mistakes, and if I've missed
405       any I'd like to know about it.
406
407       Any memory leaks from the library are also a concern; as long as you
408       are conscientious about calling the cleanup functions ("bt_free_ast()"
409       and "bt_cleanup()"), then the library shouldn't leak.
410

SEE ALSO

412       To read and parse BibTeX data files, see bt_input.
413
414       To traverse the syntax tree that results, see bt_traversal.
415
416       To learn what is done to values in parsed entries, and how to customize
417       that munging, see bt_postprocess.
418
419       To learn how btparse deals with strings, see bt_strings (oops, I
420       haven't written this one yet!).
421
422       To manipulate and access the btparse macro table, see bt_macros.
423
424       For splitting author names and lists "the BibTeX way" using btparse,
425       see bt_split_names.
426
427       To put author names back together again, see bt_format_names.
428
429       Miscellaneous functions for processing strings "the BibTeX way":
430       bt_misc.
431
432       A semi-formal language definition is in bt_language.
433

AUTHOR

435       Greg Ward <gward@python.net>
436
438       Copyright (c) 1996-97 by Gregory P. Ward.
439
440       This library is free software; you can redistribute it and/or modify it
441       under the terms of the GNU Library General Public License as published
442       by the Free Software Foundation; either version 2 of the License, or
443       (at your option) any later version.
444
445       This library is distributed in the hope that it will be useful, but
446       WITHOUT ANY WARRANTY; without even the implied warranty of
447       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
448       Library General Public License for more details.
449
450       You should have received a copy of the GNU Library General Public
451       License along with this library; if not, write to the Free Software
452       Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
453

AVAILABILITY

455       The btOOL home page, where you can get up-to-date information about
456       btparse (and download the latest version) is
457
458          http://starship.python.net/~gward/btOOL/
459
460       You will also find the latest version of Text::BibTeX, the Perl library
461       that provides a high-level front-end to btparse, there.  btparse is
462       needed to build "Text::BibTeX", and must be downloaded separately.
463
464       Both libraries are also available on CTAN (the Comprehensive TeX
465       Archive Network, "http://www.ctan.org/tex-archive/") and CPAN (the
466       Comprehensive Perl Archive Network, "http://www.cpan.org/").  Look in
467       biblio/bibtex/utils/btOOL/ on CTAN, and authors/Greg_Ward/ on CPAN.
468       For example,
469
470          http://www.ctan.org/tex-archive/biblio/bibtex/utils/btOOL/
471          http://www.cpan.org/authors/Greg_Ward
472
473       will both get you to the latest version of "Text::BibTeX" and btparse
474       -- but of course, you should always access busy sites like CTAN and
475       CPAN through a mirror.
476
477
478
479btparse, version 0.88             2020-01-30                        BTPARSE(1)
Impressum