1MANDOC(3)                BSD Library Functions Manual                MANDOC(3)
2

NAME

4     mandoc, deroff, mandocmsg, man_mparse, man_validate, mdoc_validate,
5     mparse_alloc, mparse_free, mparse_getkeep, mparse_keep, mparse_open,
6     mparse_readfd, mparse_reset, mparse_result, mparse_strerror,
7     mparse_strlevel, mparse_updaterc — mandoc macro compiler library
8

SYNOPSIS

10     #include <sys/types.h>
11     #include <mandoc.h>
12
13     #define ASCII_NBRSP
14     #define ASCII_HYPH
15     #define ASCII_BREAK
16
17     struct mparse *
18     mparse_alloc(int options, enum mandocerr mmin, mandocmsg mmsg,
19         enum mandoc_os os_e, char *os_s);
20
21     void
22     (*mandocmsg)(enum mandocerr errtype, enum mandoclevel level,
23         const char *file, int line, int col, const char *msg);
24
25     void
26     mparse_free(struct mparse *parse);
27
28     const char *
29     mparse_getkeep(const struct mparse *parse);
30
31     void
32     mparse_keep(struct mparse *parse);
33
34     int
35     mparse_open(struct mparse *parse, const char *fname);
36
37     enum mandoclevel
38     mparse_readfd(struct mparse *parse, int fd, const char *fname);
39
40     void
41     mparse_reset(struct mparse *parse);
42
43     void
44     mparse_result(struct mparse *parse, struct roff_man **man,
45         char **sodest);
46
47     const char *
48     mparse_strerror(enum mandocerr);
49
50     const char *
51     mparse_strlevel(enum mandoclevel);
52
53     void
54     mparse_updaterc(struct mparse *parse, enum mandoclevel *rc);
55
56     #include <roff.h>
57
58     void
59     deroff(char **dest, const struct roff_node *node);
60
61     #include <sys/types.h>
62     #include <mandoc.h>
63     #include <mdoc.h>
64
65     extern const char * const * mdoc_argnames;
66     extern const char * const * mdoc_macronames;
67
68     void
69     mdoc_validate(struct roff_man *mdoc);
70
71     #include <sys/types.h>
72     #include <mandoc.h>
73     #include <man.h>
74
75     extern const char * const * man_macronames;
76
77     const struct mparse *
78     man_mparse(const struct roff_man *man);
79
80     void
81     man_validate(struct roff_man *man);
82

DESCRIPTION

84     The mandoc library parses a UNIX manual into an abstract syntax tree
85     (AST).  UNIX manuals are composed of mdoc(7) or man(7), and may be mixed
86     with roff(7), tbl(7), and eqn(7) invocations.
87
88     The following describes a general parse sequence:
89
90     1.   initiate a parsing sequence with mchars_alloc(3) and mparse_alloc();
91
92     2.   open a file with open(2) or mparse_open();
93
94     3.   parse it with mparse_readfd();
95
96     4.   close it with close(2);
97
98     5.   retrieve the syntax tree with mparse_result();
99
100     6.   depending on whether the macroset member of the returned struct
101          roff_man is MACROSET_MDOC or MACROSET_MAN, validate it with
102          mdoc_validate() or man_validate(), respectively;
103
104     7.   if information about the validity of the input is needed, fetch it
105          with mparse_updaterc();
106
107     8.   iterate over parse nodes, starting from the first member of the
108          returned struct roff_man;
109
110     9.   free all allocated memory with mparse_free() and mchars_free(3), or
111          invoke mparse_reset() and go back to step 2 to parse new files.
112

REFERENCE

114     This section documents the functions, types, and variables available via
115     <mandoc.h>, with the exception of those documented in mandoc_escape(3)
116     and mchars_alloc(3).
117
118   Types
119     enum mandocerr
120     An error or warning message during parsing.
121
122     enum mandoclevel
123     A classification of an enum mandocerr as regards system operation.  See
124     the DIAGNOSTICS section in mandoc(1) regarding the meanings of the lev‐
125     els.
126
127     struct mparse
128     An opaque pointer to a running parse sequence.  Created with
129     mparse_alloc() and freed with mparse_free().  This may be used across
130     parsed input if mparse_reset() is called between parses.
131
132     mandocmsg
133     A prototype for a function to handle error and warning messages emitted
134     by the parser.
135
136   Functions
137     deroff()
138     Obtain a text-only representation of a struct roff_node, including text
139     contained in its child nodes.  To be used on children of the first member
140     of struct roff_man.  When it is no longer needed, the pointer returned
141     from deroff() can be passed to free(3).
142
143     man_mparse()
144     Get the parser used for the current output.  Declared in <man.h>, imple‐
145     mented in man.c.
146
147     man_validate()
148     Validate the MACROSET_MAN parse tree obtained with mparse_result().
149     Declared in <man.h>, implemented in man.c.
150
151     mdoc_validate()
152     Validate the MACROSET_MDOC parse tree obtained with mparse_result().
153     Declared in <mdoc.h>, implemented in mdoc.c.
154
155     mparse_alloc()
156     Allocate a parser.  The arguments have the following effect:
157
158          options  When the MPARSE_MDOC or MPARSE_MAN bit is set, only that
159                   parser is used.  Otherwise, the document type is automati‐
160                   cally detected.
161
162                   When the MPARSE_SO bit is set, roff(7) so file inclusion
163                   requests are always honoured.  Otherwise, if the request is
164                   the only content in an input file, only the file name is
165                   remembered, to be returned in the sodest argument of
166                   mparse_result().
167
168                   When the MPARSE_QUICK bit is set, parsing is aborted after
169                   the NAME section.  This is for example useful in
170                   makewhatis(8) -Q to quickly build minimal databases.
171
172          mmin     Can be set to MANDOCERR_BASE, MANDOCERR_STYLE,
173                   MANDOCERR_WARNING, MANDOCERR_ERROR, MANDOCERR_UNSUPP, or
174                   MANDOCERR_MAX.  Messages below the selected level will be
175                   suppressed.
176
177          mmsg     A callback function to handle errors and warnings.  See
178                   main.c for an example.  If printing of error messages is
179                   not desired, NULL may be passed.
180
181          os_e     Operating system to check base system conventions for.  If
182                   MANDOC_OS_OTHER, the system is automatically detected from
183                   Os, -Ios, or uname(3).
184
185          os_s     A default string for the mdoc(7) Os macro, overriding the
186                   OSNAME preprocessor definition and the results of uname(3).
187                   Passing NULL sets no default.
188
189     The same parser may be used for multiple files so long as mparse_reset()
190     is called between parses.  mparse_free() must be called to free the mem‐
191     ory allocated by this function.  Declared in <mandoc.h>, implemented in
192     read.c.
193
194     mparse_free()
195     Free all memory allocated by mparse_alloc().  Declared in <mandoc.h>,
196     implemented in read.c.
197
198     mparse_getkeep()
199     Acquire the keep buffer.  Must follow a call of mparse_keep().  Declared
200     in <mandoc.h>, implemented in read.c.
201
202     mparse_keep()
203     Instruct the parser to retain a copy of its parsed input.  This can be
204     acquired with subsequent mparse_getkeep() calls.  Declared in <mandoc.h>,
205     implemented in read.c.
206
207     mparse_open()
208     Open the file for reading.  If that fails and fname does not already end
209     in ‘.gz’, try again after appending ‘.gz’.  Save the information about whether
210     the file is zipped or not.  Return a file descriptor open for reading or
211     -1 on failure.  It can be passed to mparse_readfd() or used directly.
212     Declared in <mandoc.h>, implemented in read.c.
213
214     mparse_readfd()
215     Parse a file descriptor opened with open(2) or mparse_open().  Pass the
216     associated filename in fname.  This function may be called multiple times
217     with different parameters; however, close(2) and mparse_reset() should be
218     invoked between parses.  Declared in <mandoc.h>, implemented in read.c.
219
220     mparse_reset()
221     Reset a parser so that mparse_readfd() may be used again.  Declared in
222     <mandoc.h>, implemented in read.c.
223
224     mparse_result()
225     Obtain the result of a parse.  One of the two pointers will be filled in.
226     Declared in <mandoc.h>, implemented in read.c.
227
228     mparse_strerror()
229     Return a statically-allocated string representation of an error code.
230     Declared in <mandoc.h>, implemented in read.c.
231
232     mparse_strlevel()
233     Return a statically-allocated string representation of a level code.
234     Declared in <mandoc.h>, implemented in read.c.
235
236     mparse_updaterc()
237     If the highest warning or error level that occurred during the current
238     parse is higher than *rc, update *rc accordingly.  This is useful after
239     calling mdoc_validate() or man_validate().  Declared in <mandoc.h>,
240     implemented in read.c.
241
242   Variables
243     man_macronames
244     The string representation of a man(7) macro as indexed by enum mant.
245
246     mdoc_argnames
247     The string representation of an mdoc(7) macro argument as indexed by enum
248     mdocargt.
249
250     mdoc_macronames
251     The string representation of an mdoc(7) macro as indexed by enum mdoct.
252

IMPLEMENTATION NOTES

254     This section consists of structural documentation for mdoc(7) and man(7)
255     syntax trees and strings.
256
257   Man and Mdoc Strings
258     Strings may be extracted from mdoc and man meta-data, or from text nodes
259     (MDOC_TEXT and MAN_TEXT, respectively).  These strings have special non-
260     printing formatting cues embedded in the text itself, as well as roff(7)
261     escapes preserved from input.  Implementing systems will need to handle
262     both situations to produce human-readable text.  In general, strings may
263     be assumed to consist of 7-bit ASCII characters.
264
265     The following non-printing characters may be embedded in text strings:
266
267     ASCII_NBRSP
268             A non-breaking space character.
269
270     ASCII_HYPH
271             A soft hyphen.
272
273     ASCII_BREAK
274             A breakable zero-width space.
275
276     Escape characters are also passed verbatim into text strings.  An escape
277     character is a sequence of characters beginning with the backslash (‘\’).
278     To construct human-readable text, these should be intercepted with
279     mandoc_escape(3) and converted with one of the functions described in
280     mchars_alloc(3).
281
282   Man Abstract Syntax Tree
283     This AST is governed by the ontological rules dictated in man(7) and
284     derives its terminology accordingly.
285
286     The AST is composed of struct roff_node nodes with element, root and text
287     types as declared by the type field.  Each node also provides its parse
288     point (the line, pos, and sec fields), its position in the tree (the
289     parent, child, next and prev fields) and some type-specific data.
290
291     The tree itself is arranged according to the following normal form, where
292     capitalised non-terminals represent nodes.
293
294     ROOT       ← mnode+
295     mnode      ← ELEMENT | TEXT | BLOCK
296     BLOCK      ← HEAD BODY
297     HEAD       ← mnode*
298     BODY       ← mnode*
299     ELEMENT    ← ELEMENT | TEXT*
300     TEXT       ← [[:ascii:]]*
301
302     The only elements capable of nesting other elements are those with next-
303     line scope as documented in man(7).
304
305   Mdoc Abstract Syntax Tree
306     This AST is governed by the ontological rules dictated in mdoc(7) and
307     derives its terminology accordingly.  "In-line" elements described in
308     mdoc(7) are described simply as "elements".
309
310     The AST is composed of struct roff_node nodes with block, head, body,
311     element, root and text types as declared by the type field.  Each node
312     also provides its parse point (the line, pos, and sec fields), its posi‐
313     tion in the tree (the parent, child, last, next and prev fields) and some
314     type-specific data, in particular, for nodes generated from macros, the
315     generating macro in the tok field.
316
317     The tree itself is arranged according to the following normal form, where
318     capitalised non-terminals represent nodes.
319
320     ROOT       ← mnode+
321     mnode      ← BLOCK | ELEMENT | TEXT
322     BLOCK      ← HEAD [TEXT] (BODY [TEXT])+ [TAIL [TEXT]]
323     ELEMENT    ← TEXT*
324     HEAD       ← mnode*
325     BODY       ← mnode* [ENDBODY mnode*]
326     TAIL       ← mnode*
327     TEXT       ← [[:ascii:]]*
328
329     Of note are the TEXT nodes following the HEAD, BODY and TAIL nodes of the
330     BLOCK production: these refer to punctuation marks.  Furthermore,
331     although a TEXT node will generally have a non-zero-length string, in the
332     specific case of ‘.Bd -literal’, an empty line will produce a zero-length
333     string.  Multiple body parts are only found in invocations of ‘Bl
334     -column’, where a new body introduces a new phrase.
335
336     The mdoc(7) syntax tree accommodates broken block structures as well.
337     The ENDBODY node is available to end the formatting associated with a
338     given block before the physical end of that block.  It has a non-null end
339     field, is of the BODY type, has the same tok as the BLOCK it is ending,
340     and has a pending field pointing to that BLOCK's BODY node.  It is an
341     indirect child of that BODY node and has no children of its own.
342
343     An ENDBODY node is generated when a block ends while one of its child
344     blocks is still open, like in the following example:
345
346           .Ao ao
347           .Bo bo ac
348           .Ac bc
349           .Bc end
350
351     This example results in the following block structure:
352
353           BLOCK Ao
354               HEAD Ao
355               BODY Ao
356                   TEXT ao
357                   BLOCK Bo, pending -> Ao
358                       HEAD Bo
359                       BODY Bo
360                           TEXT bo
361                           TEXT ac
362                           ENDBODY Ao, pending -> Ao
363                           TEXT bc
364           TEXT end
365
366     Here, the formatting of the Ao block extends from TEXT ao to TEXT ac,
367     while the formatting of the Bo block extends from TEXT bo to TEXT bc.  It
368     renders as follows in -Tascii mode:
369
370           <ao [bo ac> bc] end
371
372     Support for badly-nested blocks is only provided for backward compatibil‐
373     ity with some older mdoc(7) implementations.  Using badly-nested blocks
374     is strongly discouraged; for example, the -Thtml front-end to mandoc(1)
375     is unable to render them in any meaningful way.  Furthermore, behaviour
376     when encountering badly-nested blocks is not consistent across troff
377     implementations, especially when using multiple levels of badly-nested
378     blocks.
379

SEE ALSO

381     mandoc(1), man.cgi(3), mandoc_escape(3), mandoc_headers(3),
382     mandoc_malloc(3), mansearch(3), mchars_alloc(3), tbl(3), eqn(7), man(7),
383     mandoc_char(7), mdoc(7), roff(7), tbl(7)
384

AUTHORS

386     The mandoc library was written by Kristaps Dzonsons <kristaps@bsd.lv> and
387     is maintained by Ingo Schwarze <schwarze@openbsd.org>.
388
389BSD                              June 20, 2019                             BSD
Impressum