csv(3) - f31

1CSV(3)                     Library Functions Manual                     CSV(3)
2
3
4

NAME

6       csv - CSV parser and writer library
7

SYNOPSIS

9       #include <libcsv/csv.h>
10
11       int csv_init(struct csv_parser *p, unsigned char options);
12       size_t csv_parse(struct csv_parser *p,
13               const void *s,
14               size_t len,
15               void (*cb1)(void *, size_t, void *),
16               void (*cb2)(int, void *),
17               void *data);
18       int csv_fini(struct csv_parser *p,
19               void (*cb1)(void *, size_t, void *),
20               void (*cb2)(int, void *),
21               void *data);
22       void csv_free(struct csv_parser *p);
23
24       unsigned char csv_get_delim(struct csv_parser *p);
25       unsigned char csv_get_quote(struct csv_parser *p);
26       void csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char));
27       void csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char));
28
29       int csv_get_opts(struct csv_parser *p);
30       int csv_set_opts(struct csv_parser *p, unsigned char options);
31       int csv_error(struct csv_parser *p);
32       char * csv_strerror(int error);
33
34       size_t csv_write(void *dest, size_t dest_size, const void *src,
35               size_t src_size);
36       int csv_fwrite(FILE *fp, const void *src, size_t src_size);
37
38       size_t csv_write2(void *dest, size_t dest_size, const void *src,
39               size_t src_size, unsigned char quote);
40       int csv_fwrite2(FILE *fp, const void *src, size_t src_size, unsigned char quote);
41
42       void csv_set_realloc_func(struct csv_parser *p, void *(*func)(void *, size_t));
43       void csv_set_free_func(struct csv_parser *p, void (*func)(void *));
44       void csv_set_blk_size(struct csv_parser *p, size_t size);
45       size_t csv_get_blk_size(struct csv_parser *p);
46       size_t csv_get_buffer_size(struct csv_parser *p);
47
48

DESCRIPTION

50       The  CSV  library  provides a flexible, intuitive interface for parsing
51       and writing csv data.
52
53

OVERVIEW

55       The idea behind parsing with libcsv is straight-forward: you initialize
56       a parser object with csv_init() and feed data to the parser over one or
57       more calls to csv_parse() providing callback functions that handle end-
58       of-field  and  end-of-row events.  csv_parse() parses the data provided
59       calling the user-defined callback functions  as  it  reads  fields  and
60       rows.   When  complete,  csv_fini()  is called to finish processing the
61       current field and make a final call to the callback functions  if  nec‐
62       essary.   csv_free()  is  then  called  to  free  the   parser  object.
63       csv_error() and csv_strerror() provide information about errors encoun‐
64       tered  by the functions.  csv_write() and csv_fwrite() provide a simple
65       interface for converting raw data into CSV data and storing the  result
66       into a buffer or file respectively.
67
68       CSV  is  a binary format allowing the storage of arbitrary binary data,
69       files opened for reading or writing CSV data should be opened in binary
70       mode.
71
72       libcsv provides a default mode in which the parser will happily process
73       any data as CSV without complaint, this is  useful  for  parsing  files
74       which  don't adhere to all the traditional rules. A strict mode is also
75       supported which will cause any violation of the imposed rules to  cause
76       a parsing failure.
77
78

ROUTINES

80   PARSING DATA
81       csv_init()  initializes  a  pointer  to  a  csv_parser structure.  This
82       structure contains housekeeping information such as the  current  state
83       of  the  parser,  the  buffer,  current  size  and  position, etc.  The
84       csv_init() function returns 0 on success  and  a  non-zero  value  upon
85       failure.   csv_init()  will  fail if the pointer passed to it is a null
86       pointer.  The options argument specifies the parser options, these  may
87       be changed later with the csv_set_opts() function.
88
89       OPTIONS
90
91              CSV_STRICT
92                     Enables strict mode.
93
94              CSV_REPALL_NL
95                     Causes  each  instance  of  a carriage return or linefeed
96                     outside of a record to be reported.
97
98              CSV_STRICT_FINI
99                     Causes  unterminated   quoted   fields   encountered   in
100                     csv_fini() to cause a parsing error (see below).
101
102              CSV_APPEND_NULL
103                     Will  cause all fields to be nul-terminated when provided
104                     to cb1, introduced in 3.0.0.
105
106              CSV_EMPTY_IS_NULL
107                     Will cause NULL to be passed as the first argument to cb1
108                     for empty, unquoted, fields.  Empty means consisting only
109                     of either spaces and tabs or the values  defined  by  a
110                     custom   function  registered  via  csv_set_space_func().
111                     Added in 3.0.3.
112
113       Multiple options can be specified by OR-ing them together.
114
115       csv_parse() is the function that does the actual parsing,  it  takes  6
116       arguments:
117
118              p is a pointer to an initialized struct csv_parser.
119
120              s  is  a  pointer  to the data to read in, such as a dynamically
121              allocated region of memory containing data read in from  a  call
122              to fread().
123
124              len is the number of bytes of data to process.
125
126              cb1  is  a  pointer to the callback function that will be called
127              from csv_parse() after an entire field has been read.  cb1  will
128              be  called  with a pointer to the parsed data (which is NOT nul-
129              terminated unless the CSV_APPEND_NULL option is set), the number
130              of  bytes  in  the  data,  and  the  pointer  that was passed to
131              csv_parse().
132
133              cb2 is a pointer to the callback function that  will  be  called
134              when  the end of a record is encountered, it will be called with
135              the character that caused the record to end, cast to an unsigned
136              char,  or  -1  if called from csv_fini, and the pointer that was
137              passed to csv_parse() or csv_fini().
138
139              data is a pointer to user-defined data that will  be  passed  to
140              the callback functions when invoked.
141
142              cb1  and/or  cb2  may  be NULL in which case no function will be
143              called for the associated actions.  data may also  be  NULL  but
144              the  callback  functions  must be prepared to handle receiving a
145              null pointer.
146
147       By default cb2 is not called when rows that do not contain  any  fields
148       are encountered.  This behavior is meant to accommodate files using only
149       either a linefeed or a carriage return as  a  record  separator  to  be
150       parsed  properly  while at the same time preventing files with rows that
151       are terminated by multiple characters from  resulting  in  blank  rows
152       after  each actual row of data (for example, processing a text CSV file
153       that was created on a Windows machine  on  a  Unix  machine).   The
154       CSV_REPALL_NL  option  will  cause cb2 to be called once for every car‐
155       riage return or linefeed encountered outside of a field.  cb2 is called
156       with the character that prompted the call to the function, cast  to  an
157       unsigned char, either CSV_CR for carriage return, CSV_LF for  linefeed,
158       or  -1 for record termination from a call to csv_fini() (see below).  A
159       carriage return or linefeed within a non-quoted field always marks both
160       the  end of the field and the row.  Other characters can be used as row
161       terminators  and  thus  be  provided  as  an  argument  to  cb2   using
162       csv_set_term_func().
163
164       Note: The first parameter of the cb1 function is void *, not const void
165       *; the pointer passed to the callback function is actually a pointer to
166       the  entry buffer inside the csv_parser struct, this data may safely be
167       modified from the callback function (or any function that the  callback
168       function  calls) but you must not attempt to access more than len bytes
169       and you should not access the data after the callback function  returns
170       as  the  buffer  is dynamically allocated and its location and size may
171       change during calls to csv_parse().
172
173       Note: Different callback functions may safely be specified during  each
174       call to csv_parse() but keep in mind that the callback functions may be
175       called many times during a single call to csv_parse() depending on  the
176       amount of data being processed in a given call.
177
178       csv_parse() returns the number of bytes processed, on a successful call
179       this will be len, if it is less than len an  error  has  occurred.   An
180       error  can occur, for example, if there is insufficient memory to store
181       the contents of the current field in the entry buffer.   An  error  can
182       also  occur  if  malformed  data is encountered while running in strict
183       mode.
184
185       The csv_error() function can be used to determine what the error is and
186       the  csv_strerror()  function can be used to provide a textual descrip‐
187       tion of the error. csv_error() takes a single argument, a pointer to  a
188       struct  csv_parser,  and returns one of the following values defined in
189       csv.h:
190
191              CSV_EPARSE   A parse error has occurred while in strict mode
192
193              CSV_ENOMEM   There was not enough  memory  while  attempting  to
194              increase the entry buffer for the current field
195
196              CSV_ETOOBIG  Continuing  to  process  the  current  field  would
197              require a buffer of more than SIZE_MAX bytes
198
199       The  value  passed  to  csv_strerror()  should  be  one  returned  from
200       csv_error().   The  return  value  of  csv_strerror() is a pointer to a
201       static string. The pointer may be used for the entire lifetime  of  the
202       program  and the contents will not change during execution but you must
203       not attempt to modify the string it points to.
204
205       When you have finished submitting data to csv_parse(), you need to call
206       the csv_fini() function.  This function will call the cb1 function with
207       any remaining data in the entry buffer (if there is any) and  call  the
208       cb2  function  unless we are already at the end of a row (the last byte
209       processed was a newline character for example).  It  is  necessary   to
210       call  this function because the file being processed might not end with
211       a carriage return or newline but the data that has been read in to this
212       point  still needs to be submitted to the callback routines.  If cb2 is
213       called from within csv_fini() it will be because the row was not termi‐
214       nated  with a newline sequence, in this case cb2 will be called with an
215       argument of -1.
216
217       Note: A call to csv_fini() implicitly  ends  the  current  field  and
218       row.   If the last field processed is a quoted field that ends before a
219       closing quote is encountered, no error will  be  reported  by  default,
220       even  if  CSV_STRICT  is  specified.   To cause csv_fini() to report an
221       error in such a case, set the CSV_STRICT_FINI option  (new  in  version
222       1.0.1) in addition to the CSV_STRICT option.
223
224       csv_fini()  also  reinitializes the parser state so that it is ready to
225       be used on the next file or set of data.  csv_fini() does not alter the
226       current buffer size. If the last set of data that was being parsed con‐
227       tained a very large field that increased the size of  the  buffer,  and
228       you  need  to  free  that  memory  before  continuing,  you  must  call
229       csv_free(), you do not need to call csv_init() again after  csv_free().
230       Like  csv_parse,  the  callback functions provided to csv_fini() may be
231       NULL.  csv_fini() returns 0 on success and a non-zero value if you pass
232       it a null pointer.
233
234       After  calling  csv_fini()  you  may  continue  to  use the same struct
235       csv_parser pointer without reinitializing it (in fact you must not call
236       csv_init()  with  an  initialized csv_parser object or the memory allo‐
237       cated for the original structure will be lost).
238
239       When you are finished using the csv_parser  object  you  can  free  any
240       dynamically  allocated memory associated with it by calling csv_free().
241       You may call csv_free() at any time, it need not be preceded by a  call
242       to  csv_fini().   You  must only call csv_free() on a csv_parser object
243       that has been initialized with a successful call to csv_init().
244
245   WRITING DATA
246       libcsv provides two functions to transform raw data into CSV  formatted
247       data:  the  csv_write()  function which writes the result to a provided
248       buffer, and the csv_fwrite() function which  writes  the  result  to  a
249       file.   The  functionality  of both functions is straight-forward, they
250       write out a single field including the opening and closing  quotes  and
251       escape each encountered quote with another quote.
252
253       The  csv_write()  function takes a pointer to a source buffer (src) and
254       processes at most src_size characters from src.  csv_write() will write
255       at  most dest_size characters to dest and returns the number of charac‐
256       ters that would have been written if dest was large enough.   This  can
257       be  used  to  determine if all the characters were written and, if not,
258       how large dest needs to be to write out all of the  data.   csv_write()
259       may  be  called with a null pointer for the dest argument in which case
260       no data is written but the size required to write out the data will  be
261       returned.   The  space  needed to write out the data is the size of the
262       data + number of quotes appearing in data (each one will be escaped)  +
263       2  (the  leading and terminating quotes).  csv_write() and csv_fwrite()
264       always surround the output data with quotes.  If src_size is very large
265       (SIZE_MAX/2  or greater) it is possible that the number of bytes needed
266       to represent the data, after inserting escaping quotes, will be greater
267       than  SIZE_MAX.   In  such a case, csv_write will return SIZE_MAX which
268       should be interpreted as meaning the data is too large to  write  to  a
269       single field.  The csv_fwrite() function is not similarly limited.
270
271       csv_fwrite()  takes  a  FILE  pointer (which should have been opened in
272       binary mode) and converts and writes the data pointed to by src of size
273       src_size.   It returns 0 on success and EOF if there was an error writ‐
274       ing to the file.  csv_fwrite() doesn't provide the number of characters
275       processed  or  written.   If  this  functionality  is required, use the
276       csv_write() function combined with fwrite().
277
278       csv_write2() and csv_fwrite2() work similarly  but  take  an  additional
279       argument, the quote character to use when composing the field.
280
281   CUSTOMIZING THE PARSER
282       The  csv_set_delim()  and  csv_set_quote() functions provide a means to
283       change the characters that the parser will consider the  delimiter  and
284       quote  characters respectively, cast to unsigned char.  csv_get_delim()
285       and csv_get_quote() return the current delimiter and  quote  characters
286       respectively.   When  csv_init()  is  called  the  delimiter  is set to
287       CSV_COMMA and the quote to CSV_QUOTE.  Note that the rest  of  the  CSV
288       conventions  still  apply  when  these functions are used to change the
289       delimiter and/or quote characters,  fields  containing  the  new  quote
290       character  or  delimiter  must  be  quoted and quote characters must be
291       escaped with an immediately preceding instance of the  same  character.
292       Additionally,  the csv_set_space_func() and csv_set_term_func() allow a
293       user-defined function to be provided which will be used to determine what
294       constitutes  a space character and what constitutes a record terminator
295       character.  The space characters determine which characters are removed
296       from  the  beginning  and  end  of non-quoted fields and the terminator
297       characters govern when a record ends.  When csv_init() is  called,  the
298       effect  is  as if these functions were each called with a NULL argument
299       in which case no function is called and CSV_SPACE and CSV_TAB are  used
300       for  space  characters,  and  CSV_CR and CSV_LF are used for terminator
301       characters.
302
303       csv_set_realloc_func() can be used to set the function that  is  called
304       when the internal buffer needs to be resized, only realloc, not malloc,
305       is used internally; the default is to use the  standard  realloc  func‐
306       tion.  Likewise, csv_set_free_func() is used to set the function called
307       to free the internal buffer, the default is the standard free function.
308
309       csv_get_blk_size() and csv_set_blk_size() can be used to  get  and  set
310       the  block  size  of  the  parser  respectively.  The block size is the
311       amount of extra memory allocated every time the internal  buffer  needs
312       to be increased, the default is 128.  csv_get_buffer_size() will return
313       the current number of bytes allocated for the internal buffer.
314
315

THE CSV FORMAT

317       Although quite prevalent, there is no  standard  for  the  CSV  format.
318       There is, however, a set of traditional conventions used by many appli‐
319       cations.  libcsv follows the conventions described  at  http://www.cre‐
320       ativyst.com/Doc/Articles/CSV/CSV01.htm  which  seem to reflect the most
321       common usage of the format, namely:
322
323              Fields are separated with commas.
324
325              Rows are delimited by newline sequences (see below).
326
327              Fields may be surrounded with quotes.
328
329              Fields that contain comma, quote, or newline characters MUST  be
330              quoted.
331
332              Each instance of a quote character must be escaped with an imme‐
333              diately preceding quote character.
334
335              Leading and trailing spaces and tabs are removed from non-quoted
336              fields.
337
338              The final line need not contain a newline sequence.
339
340       In  strict  mode, any detectable violation of these rules results in an
341       error.
342
343       RFC 4180 is an informational memo which attempts to  document  the  CSV
344       format, especially with regards to its use as a MIME type.   There  are
345       several parts of the description documented in this memo  which  either
346       do not accurately reflect widely used conventions or artificially limit
347       the usefulness of the format.  The  differences  between  the  RFC  and
348       libcsv are:
349
350              "Each  line  should contain the same number of fields throughout
351              the file"
352                     libcsv doesn't care if every record contains a  different
353                     number  of  fields,  such  a  restriction could easily be
354                     enforced by the application itself if desired.
355
356              "Spaces are considered  part  of  a  field  and  should  not  be
357              ignored"
358                     Leading  and  trailing spaces that are part of non-quoted
359                     fields are ignored as this is  by  far  the  most  common
360                     behavior and expected by many applications.
361
362                     abc ,  def
363
364                     is considered equivalent to:
365
366                     "abc", "def"
367
368              "The last field in the record must not be followed by a comma"
369                     The  meaning  of  this  statement is not clear but if the
370                     last character of a record is a comma, libcsv will inter‐
371                     pret that as a final empty field, i.e.:
372
373                     "abc", "def",
374
375                     will be interpreted as 3 fields, equivalent to:
376
377                     "abc", "def", ""
378
379              RFC  4180 limits the allowable characters in a CSV field, libcsv
380              allows any character to  be  present  in  a  field  provided  it
381              adheres  to the conventions mentioned above.  This makes it pos‐
382              sible to store binary data in CSV format, an attribute that many
383              applications rely on.
384
385              RFC 4180 states that a Carriage Return plus Linefeed combination
386              is used to delimit records, libcsv  allows  any  combination  of
387              Carriage  Returns  and Linefeeds to signify the end of a record.
388              This is to increase portability among systems that use different
389              combinations to denote a newline sequence.
390

PARSING MALFORMED DATA

392       libcsv  should  correctly parse any CSV data that conforms to the rules
393       discussed above.  By default, however,  libcsv  will  also  attempt  to
394       parse  malformed  CSV  data such as data containing unescaped quotes or
395       quotes within non-quoted fields.  For example:
396
397       a"c, "d"f"
398
399       would be parsed equivalently to the correct form:
400
401       "a""c", "d""f"
402
403       This is often desirable as there are  some  applications  that  do  not
404       adhere  to the specifications previously discussed.  However, there are
405       instances where malformed CSV data is ambiguous, namely when a comma or
406       newline is the next non-space character following a quote such as:
407
408       "Sally said "Hello", Wally said "Goodbye""
409
410       This could either be parsed as a single field containing the data:
411
412       Sally said "Hello", Wally said "Goodbye"
413
414       or as 2 separate fields:
415
416       Sally said "Hello and Wally said "Goodbye""
417
418       Since  the  data  is  malformed,  there  is no way to know if the quote
419       before the comma is meant to be a literal quote or if it signifies  the
420       end  of  the field.  This is of course not an issue for properly formed
421       data as all quotes must be escaped.  libcsv will parse this example  as
422       2 separate fields.
423
424       libcsv  provides a strict mode that will return with a parse error if a
425       quote is seen inside a non-quoted field or if a  non-escaped  quote  is
426       seen whose next non-space character isn't a comma or newline sequence.
427
428

PARSER DETAILS

430       A field is considered quoted if the first non-space character for a new
431       field is a quote.
432
433       If a quote is encountered in a quoted  field  and  the  next  non-space
434       character  is a comma, the field ends at the closed quote and the field
435       data is submitted when the comma is encountered.  If the next non-space
436       character  after  a quote is a newline character, the row has ended and
437       the field data is submitted and the end of row is  signalled  (via  the
438       appropriate  callback  function).   If two quotes are immediately adja‐
439       cent, the first one is interpreted as escaping the second one  and  one
440       quote  is written to the field buffer.  If the next non-space character
441       following a quote is anything else, the quote is interpreted as a  non-
442       escaped  literal quote and it and what follows are written to the field
443       buffer, this would cause a parse error in strict mode.
444
445       Example 1
446       "abc"""
447       Parses as: abc"
448       The first quote marks the field as quoted, the second quote escapes the
449       following  quote  and  the last quote ends the field.  This is valid in
450       both strict and non-strict modes.
451
452       Example 2
453       "ab"c"
454       Parses as: ab"c
455       The first quote marks the field as quoted, the second quote is taken as
456       a  literal  quote since the next non-space character is not a comma, or
457       newline and the quote is not escaped.  The last quote  ends  the  field
458       (assuming there is a newline character following).  A parse error would
459       result upon seeing the character c in strict mode.
460
461       Example 3
462       "abc" "
463       Parses as: abc"
464       In this case, since the next non-space character following  the  second
465       quote  is not a comma or newline character, a literal quote is written,
466       the space character after is part of the field, and the last quote ter‐
467       minated  the field.  This demonstrates the fact that a quote must imme‐
468       diately precede another quote to escape it.  This would  be  a  strict-
469       mode violation as all quotes are required to be escaped.
470
471       If the field is not quoted, any quote character is taken as part of the
472       field data, any comma terminated the field, and any  newline  character
473       terminated the field and the record.
474
475       Example 4
476       ab""c
477       Parses as: ab""c
478       Quotes  are not considered special in non-quoted fields.  This would be
479       a strict mode violation since quotes may not exist in non-quoted fields
480       in strict mode.
481
482

EXAMPLES

484       The  following  example prints the number of fields and rows in a file.
485       This is a simplified version of the csvinfo  program  provided  in  the
486       examples  directory.   Error  checking  not  related to libcsv has been
487       removed for clarity, the csvinfo program also provides  an  option  for
488       enabling strict mode and handles multiple files.
489
490              #include <stdio.h>
491              #include <string.h>
492              #include <errno.h>
493              #include <stdlib.h>
494              #include "libcsv/csv.h"
495
496              struct counts {
497                long unsigned fields;
498                long unsigned rows;
499              };
500
501              void cb1 (void *s, size_t len, void *data) {
502                ((struct counts *)data)->fields++; }
503              void cb2 (int c, void *data) {
504                ((struct counts *)data)->rows++; }
505
506              int main (int argc, char *argv[]) {
507                FILE *fp;
508                struct csv_parser p;
509                char buf[1024];
510                size_t bytes_read;
511                struct counts c = {0, 0};
512
513                if (csv_init(&p, 0) != 0) exit(EXIT_FAILURE);
514                fp = fopen(argv[1], "rb");
515                if (!fp) exit(EXIT_FAILURE);
516
517                while ((bytes_read=fread(buf, 1, 1024, fp)) > 0)
518                  if (csv_parse(&p, buf, bytes_read, cb1, cb2, &c) != bytes_read) {
519                    fprintf(stderr, "Error while parsing file: %s\n",
520                    csv_strerror(csv_error(&p)) );
521                    exit(EXIT_FAILURE);
522                  }
523
524                csv_fini(&p, cb1, cb2, &c);
525
526                fclose(fp);
527                printf("%lu fields, %lu rows\n", c.fields, c.rows);
528
529                csv_free(&p);
530                exit(EXIT_SUCCESS);
531              }
532
533       See the examples directory for several complete example programs.
534
535

AUTHOR

537       Written by Robert Gamble.
538
539

BUGS

541       Please send questions, comments, bugs, etc. to:
542
543               rgamble@users.sourceforge.net
544
545
546
547                                9 January 2013                          CSV(3)