1CSV(3) Library Functions Manual CSV(3)
2
3
4
6 csv - CSV parser and writer library
7
9 #include <libcsv/csv.h>
10
11 int csv_init(struct csv_parser *p, unsigned char options);
12 size_t csv_parse(struct csv_parser *p,
13 const void *s,
14 size_t len,
15 void (*cb1)(void *, size_t, void *),
16 void (*cb2)(int, void *),
17 void *data);
18 int csv_fini(struct csv_parser *p,
19 void (*cb1)(void *, size_t, void *),
20 void (*cb2)(int, void *),
21 void *data);
22 void csv_free(struct csv_parser *p);
23
24 unsigned char csv_get_delim(struct csv_parser *p);
25 unsigned char csv_get_quote(struct csv_parser *p);
26 void csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char));
27 void csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char));
28
29 int csv_get_opts(struct csv_parser *p);
30 int csv_set_opts(struct csv_parser *p, unsigned char options);
31 int csv_error(struct csv_parser *p);
32 char * csv_strerror(int error);
33
34 size_t csv_write(void *dest, size_t dest_size, const void *src,
35 size_t src_size);
36 int csv_fwrite(FILE *fp, const void *src, size_t src_size);
37
38 size_t csv_write2(void *dest, size_t dest_size, const void *src,
39 size_t src_size, unsigned char quote);
40 int csv_fwrite2(FILE *fp, const void *src, size_t src_size, unsigned char quote);
41
42 void csv_set_realloc_func(struct csv_parser *p, void *(*func)(void *, size_t));
43 void csv_set_free_func(struct csv_parser *p, void (*func)(void *));
44 void csv_set_blk_size(struct csv_parser *p, size_t size);
45 size_t csv_get_blk_size(struct csv_parser *p);
46 size_t csv_get_buffer_size(struct csv_parser *p);
47
48
50 The CSV library provides a flexible, intuitive interface for parsing
51 and writing csv data.
52
53
55 The idea behind parsing with libcsv is straight-forward: you initialize
56 a parser object with csv_init() and feed data to the parser over one or
57 more calls to csv_parse() providing callback functions that handle end-
58 of-field and end-of-row events. csv_parse() parses the data provided
59 calling the user-defined callback functions as it reads fields and
60 rows. When complete, csv_fini() is called to finish processing the
61 current field and make a final call to the callback functions if nec‐
62 essary. csv_free() is then called to free the parser object.
63 csv_error() and csv_strerror() provide information about errors encoun‐
64 tered by the functions. csv_write() and csv_fwrite() provide a simple
65 interface for converting raw data into CSV data and storing the result
66 into a buffer or file respectively.
67
68 CSV is a binary format allowing the storage of arbitrary binary data,
69 files opened for reading or writing CSV data should be opened in binary
70 mode.
71
72 libcsv provides a default mode in which the parser will happily process
73 any data as CSV without complaint, this is useful for parsing files
74 which don't adhere to all the traditional rules. A strict mode is also
75 supported which will cause any violation of the imposed rules to cause
76 a parsing failure.
77
78
80 PARSING DATA
81 csv_init() initializes a pointer to a csv_parser structure. This
82 structure contains housekeeping information such as the current state
83 of the parser, the buffer, current size and position, etc. The
84 csv_init() function returns 0 on success and a non-zero value upon
85 failure. csv_init() will fail if the pointer passed to it is a null
86 pointer. The options argument specifies the parser options, these may
87 be changed later with the csv_set_opts() function.
88
89 OPTIONS
90
91 CSV_STRICT
92 Enables strict mode.
93
94 CSV_REPALL_NL
95 Causes each instance of a carriage return or linefeed
96 outside of a record to be reported.
97
98 CSV_STRICT_FINI
99 Causes unterminated quoted fields encountered in
100 csv_fini() to cause a parsing error (see below).
101
102 CSV_APPEND_NULL
103 Will cause all fields to be nul-terminated when provided
104 to cb1, introduced in 3.0.0.
105
106 CSV_EMPTY_IS_NULL
107 Will cause NULL to be passed as the first argument to cb1
108 for empty, unquoted, fields. Empty means consisting only
109 of either spaces and tabs or the values defined by a
110 custom function registered via csv_set_space_func().
111 Added in 3.0.3.
112
113 Multiple options can be specified by OR-ing them together.
114
115 csv_parse() is the function that does the actual parsing, it takes 6
116 arguments:
117
118 p is a pointer to an initialized struct csv_parser.
119
120 s is a pointer to the data to read in, such as a dynamically
121 allocated region of memory containing data read in from a call
122 to fread().
123
124 len is the number of bytes of data to process.
125
126 cb1 is a pointer to the callback function that will be called
127 from csv_parse() after an entire field has been read. cb1 will
128 be called with a pointer to the parsed data (which is NOT nul-
129 terminated unless the CSV_APPEND_NULL option is set), the number
130 of bytes in the data, and the pointer that was passed to
131 csv_parse().
132
133 cb2 is a pointer to the callback function that will be called
134 when the end of a record is encountered, it will be called with
135 the character that caused the record to end, cast to an unsigned
136 char, or -1 if called from csv_fini, and the pointer that was
137 passed to csv_parse().
138
139 data is a pointer to user-defined data that will be passed to
140 the callback functions when invoked.
141
142 cb1 and/or cb2 may be NULL in which case no function will be
143 called for the associated actions. data may also be NULL but
144 the callback functions must be prepared to handle receiving a
145 null pointer.
146
147 By default cb2 is not called when rows that do not contain any fields
148 are encountered. This behavior is meant to allow files using only
149 either a linefeed or a carriage return as a record separator to be
150 parsed properly while at the same time preventing files with
151 rows terminated by multiple characters from resulting in blank rows
152 after each actual row of data (for example, processing a text CSV file
153 that was created on a Windows machine on a Unix machine). The
154 CSV_REPALL_NL option will cause cb2 to be called once for every car‐
155 riage return or linefeed encountered outside of a field. cb2 is called
156 with the character that prompted the call to the function, cast to an
157 unsigned char, either CSV_CR for carriage return, CSV_LF for linefeed,
158 or -1 for record termination from a call to csv_fini() (see below). A
159 carriage return or linefeed within a non-quoted field always marks both
160 the end of the field and the row. Other characters can be used as row
161 terminators and thus be provided as an argument to cb2 using
162 csv_set_term_func().
163
164 Note: The first parameter of the cb1 function is void *, not const void
165 *; the pointer passed to the callback function is actually a pointer to
166 the entry buffer inside the csv_parser struct, this data may safely be
167 modified from the callback function (or any function that the callback
168 function calls) but you must not attempt to access more than len bytes
169 and you should not access the data after the callback function returns
170 as the buffer is dynamically allocated and its location and size may
171 change during calls to csv_parse().
172
173 Note: Different callback functions may safely be specified during each
174 call to csv_parse() but keep in mind that the callback functions may be
175 called many times during a single call to csv_parse() depending on the
176 amount of data being processed in a given call.
177
178 csv_parse() returns the number of bytes processed, on a successful call
179 this will be len, if it is less than len an error has occurred. An
180 error can occur, for example, if there is insufficient memory to store
181 the contents of the current field in the entry buffer. An error can
182 also occur if malformed data is encountered while running in strict
183 mode.
184
185 The csv_error() function can be used to determine what the error is and
186 the csv_strerror() function can be used to provide a textual descrip‐
187 tion of the error. csv_error() takes a single argument, a pointer to a
188 struct csv_parser, and returns one of the following values defined in
189 csv.h:
190
191 CSV_EPARSE A parse error has occurred while in strict mode
192
193 CSV_ENOMEM There was not enough memory while attempting to
194 increase the entry buffer for the current field
195
196 CSV_ETOOBIG Continuing to process the current field would
197 require a buffer of more than SIZE_MAX bytes
198
199 The value passed to csv_strerror() should be one returned from
200 csv_error(). The return value of csv_strerror() is a pointer to a
201 static string. The pointer may be used for the entire lifetime of the
202 program and the contents will not change during execution but you must
203 not attempt to modify the string it points to.
204
205 When you have finished submitting data to csv_parse(), you need to call
206 the csv_fini() function. This function will call the cb1 function with
207 any remaining data in the entry buffer (if there is any) and call the
208 cb2 function unless we are already at the end of a row (the last byte
209 processed was a newline character for example). It is necessary to
210 call this function because the file being processed might not end with
211 a carriage return or newline but the data that has been read in to this
212 point still needs to be submitted to the callback routines. If cb2 is
213 called from within csv_fini() it will be because the row was not termi‐
214 nated with a newline sequence, in this case cb2 will be called with an
215 argument of -1.
216
217 Note: A call to csv_fini() implicitly ends the current field and
218 row. If the last field processed is a quoted field that ends before a
219 closing quote is encountered, no error will be reported by default,
220 even if CSV_STRICT is specified. To cause csv_fini() to report an
221 error in such a case, set the CSV_STRICT_FINI option (new in version
222 1.0.1) in addition to the CSV_STRICT option.
223
224 csv_fini() also reinitializes the parser state so that it is ready to
225 be used on the next file or set of data. csv_fini() does not alter the
226 current buffer size. If the last set of data that was being parsed con‐
227 tained a very large field that increased the size of the buffer, and
228 you need to free that memory before continuing, you must call
229 csv_free(), you do not need to call csv_init() again after csv_free().
230 Like csv_parse, the callback functions provided to csv_fini() may be
231 NULL. csv_fini() returns 0 on success and a non-zero value if you pass
232 it a null pointer.
233
234 After calling csv_fini() you may continue to use the same struct
235 csv_parser pointer without reinitializing it (in fact you must not call
236 csv_init() with an initialized csv_parser object or the memory allo‐
237 cated for the original structure will be lost).
238
239 When you are finished using the csv_parser object you can free any
240 dynamically allocated memory associated with it by calling csv_free().
241 You may call csv_free() at any time, it need not be preceded by a call
242 to csv_fini(). You must only call csv_free() on a csv_parser object
243 that has been initialized with a successful call to csv_init().
244
245 WRITING DATA
246 libcsv provides two functions to transform raw data into CSV formatted
247 data: the csv_write() function which writes the result to a provided
248 buffer, and the csv_fwrite() function which writes the result to a
249 file. The functionality of both functions is straight-forward, they
250 write out a single field including the opening and closing quotes and
251 escape each encountered quote with another quote.
252
253 The csv_write() function takes a pointer to a source buffer (src) and
254 processes at most src_size characters from src. csv_write() will write
255 at most dest_size characters to dest and returns the number of charac‐
256 ters that would have been written if dest was large enough. This can
257 be used to determine if all the characters were written and, if not,
258 how large dest needs to be to write out all of the data. csv_write()
259 may be called with a null pointer for the dest argument in which case
260 no data is written but the size required to write out the data will be
261 returned. The space needed to write out the data is the size of the
262 data + number of quotes appearing in data (each one will be escaped) +
263 2 (the leading and terminating quotes). csv_write() and csv_fwrite()
264 always surround the output data with quotes. If src_size is very large
265 (SIZE_MAX/2 or greater) it is possible that the number of bytes needed
266 to represent the data, after inserting escaping quotes, will be greater
267 than SIZE_MAX. In such a case, csv_write will return SIZE_MAX which
268 should be interpreted as meaning the data is too large to write to a
269 single field. The csv_fwrite() function is not similarly limited.
270
271 csv_fwrite() takes a FILE pointer (which should have been opened in
272 binary mode) and converts and writes the data pointed to by src of size
273 src_size. It returns 0 on success and EOF if there was an error writ‐
274 ing to the file. csv_fwrite() doesn't provide the number of characters
275 processed or written. If this functionality is required, use the
276 csv_write() function combined with fwrite().
277
278 csv_write2() and csv_fwrite2() work similarly but take an additional
279 argument, the quote character to use when composing the field.
280
281 CUSTOMIZING THE PARSER
282 The csv_set_delim() and csv_set_quote() functions provide a means to
283 change the characters that the parser will consider the delimiter and
284 quote characters respectively, cast to unsigned char. csv_get_delim()
285 and csv_get_quote() return the current delimiter and quote characters
286 respectively. When csv_init() is called the delimiter is set to
287 CSV_COMMA and the quote to CSV_QUOTE. Note that the rest of the CSV
288 conventions still apply when these functions are used to change the
289 delimiter and/or quote characters, fields containing the new quote
290 character or delimiter must be quoted and quote characters must be
291 escaped with an immediately preceding instance of the same character.
292 Additionally, the csv_set_space_func() and csv_set_term_func() allow a
293 user-defined function to be provided which will be used to determine what
294 constitutes a space character and what constitutes a record terminator
295 character. The space characters determine which characters are removed
296 from the beginning and end of non-quoted fields and the terminator
297 characters govern when a record ends. When csv_init() is called, the
298 effect is as if these functions were each called with a NULL argument
299 in which case no function is called and CSV_SPACE and CSV_TAB are used
300 for space characters, and CSV_CR and CSV_LF are used for terminator
301 characters.
302
303 csv_set_realloc_func() can be used to set the function that is called
304 when the internal buffer needs to be resized, only realloc, not malloc,
305 is used internally; the default is to use the standard realloc func‐
306 tion. Likewise, csv_set_free_func() is used to set the function called
307 to free the internal buffer, the default is the standard free function.
308
309 csv_get_blk_size() and csv_set_blk_size() can be used to get and set
310 the block size of the parser respectively. The block size is the
311 amount of extra memory allocated every time the internal buffer needs
312 to be increased, the default is 128. csv_get_buffer_size() will return
313 the current number of bytes allocated for the internal buffer.
314
315
317 Although quite prevalent, there is no standard for the CSV format.
318 There are however, a set of traditional conventions used by many appli‐
319 cations. libcsv follows the conventions described at http://www.cre‐
320 ativyst.com/Doc/Articles/CSV/CSV01.htm which seem to reflect the most
321 common usage of the format, namely:
322
323 Fields are separated with commas.
324
325 Rows are delimited by newline sequences (see below).
326
327 Fields may be surrounded with quotes.
328
329 Fields that contain comma, quote, or newline characters MUST be
330 quoted.
331
332 Each instance of a quote character must be escaped with an imme‐
333 diately preceding quote character.
334
335 Leading and trailing spaces and tabs are removed from non-quoted
336 fields.
337
338 The final line need not contain a newline sequence.
339
340 In strict mode, any detectable violation of these rules results in an
341 error.
342
343 RFC 4180 is an informational memo which attempts to document the CSV
344 format, especially with regards to its use as a MIME type. There are
345 several parts of the description documented in this memo which either
346 do not accurately reflect widely used conventions or artificially limit
347 the usefulness of the format. The differences between the RFC and
348 libcsv are:
349
350 "Each line should contain the same number of fields throughout
351 the file"
352 libcsv doesn't care if every record contains a different
353 number of fields, such a restriction could easily be
354 enforced by the application itself if desired.
355
356 "Spaces are considered part of a field and should not be
357 ignored"
358 Leading and trailing spaces that are part of non-quoted
359 fields are ignored as this is by far the most common
360 behavior and expected by many applications.
361
362 abc , def
363
364 is considered equivalent to:
365
366 "abc", "def"
367
368 "The last field in the record must not be followed by a comma"
369 The meaning of this statement is not clear but if the
370 last character of a record is a comma, libcsv will inter‐
371 pret that as a final empty field, i.e.:
372
373 "abc", "def",
374
375 will be interpreted as 3 fields, equivalent to:
376
377 "abc", "def", ""
378
379 RFC 4180 limits the allowable characters in a CSV field, libcsv
380 allows any character to be present in a field provided it
381 adheres to the conventions mentioned above. This makes it pos‐
382 sible to store binary data in CSV format, an attribute that many
383 applications rely on.
384
385 RFC 4180 states that a Carriage Return plus Linefeed combination
386 is used to delimit records, libcsv allows any combination of
387 Carriage Returns and Linefeeds to signify the end of a record.
388 This is to increase portability among systems that use different
389 combinations to denote a newline sequence.
390
392 libcsv should correctly parse any CSV data that conforms to the rules
393 discussed above. By default, however, libcsv will also attempt to
394 parse malformed CSV data such as data containing unescaped quotes or
395 quotes within non-quoted fields. For example:
396
397 a"c, "d"f"
398
399 would be parsed equivalently to the correct form:
400
401 "a""c", "d""f"
402
403 This is often desirable as there are some applications that do not
404 adhere to the specifications previously discussed. However, there are
405 instances where malformed CSV data is ambiguous, namely when a comma or
406 newline is the next non-space character following a quote such as:
407
408 "Sally said "Hello", Wally said "Goodbye""
409
410 This could either be parsed as a single field containing the data:
411
412 Sally said "Hello", Wally said "Goodbye"
413
414 or as 2 separate fields:
415
416 Sally said "Hello and Wally said "Goodbye""
417
418 Since the data is malformed, there is no way to know if the quote
419 before the comma is meant to be a literal quote or if it signifies the
420 end of the field. This is of course not an issue for properly formed
421 data as all quotes must be escaped. libcsv will parse this example as
422 2 separate fields.
423
424 libcsv provides a strict mode that will return with a parse error if a
425 quote is seen inside a non-quoted field or if a non-escaped quote is
426 seen whose next non-space character isn't a comma or newline sequence.
427
428
430 A field is considered quoted if the first non-space character for a new
431 field is a quote.
432
433 If a quote is encountered in a quoted field and the next non-space
434 character is a comma, the field ends at the closed quote and the field
435 data is submitted when the comma is encountered. If the next non-space
436 character after a quote is a newline character, the row has ended and
437 the field data is submitted and the end of row is signalled (via the
438 appropriate callback function). If two quotes are immediately adja‐
439 cent, the first one is interpreted as escaping the second one and one
440 quote is written to the field buffer. If the next non-space character
441 following a quote is anything else, the quote is interpreted as a non-
442 escaped literal quote and it and what follows are written to the field
443 buffer, this would cause a parse error in strict mode.
444
445 Example 1
446 "abc"""
447 Parses as: abc"
448 The first quote marks the field as quoted, the second quote escapes the
449 following quote and the last quote ends the field. This is valid in
450 both strict and non-strict modes.
451
452 Example 2
453 "ab"c"
454 Parses as: ab"c
455 The first quote marks the field as quoted, the second quote is taken as
456 a literal quote since the next non-space character is not a comma, or
457 newline and the quote is not escaped. The last quote ends the field
458 (assuming there is a newline character following). A parse error would
459 result upon seeing the character c in strict mode.
460
461 Example 3
462 "abc" "
463 Parses as: abc"
464 In this case, since the next non-space character following the second
465 quote is not a comma or newline character, a literal quote is written,
466 the space character after is part of the field, and the last quote ter‐
467 minated the field. This demonstrates the fact that a quote must imme‐
468 diately precede another quote to escape it. This would be a strict-
469 mode violation as all quotes are required to be escaped.
470
471 If the field is not quoted, any quote character is taken as part of the
472 field data, any comma terminates the field, and any newline character
473 terminates the field and the record.
474
475 Example 4
476 ab""c
477 Parses as: ab""c
478 Quotes are not considered special in non-quoted fields. This would be
479 a strict mode violation since quotes may not exist in non-quoted fields
480 in strict mode.
481
482
484 The following example prints the number of fields and rows in a file.
485 This is a simplified version of the csvinfo program provided in the
486 examples directory. Error checking not related to libcsv has been
487 removed for clarity, the csvinfo program also provides an option for
488 enabling strict mode and handles multiple files.
489
490 #include <stdio.h>
491 #include <string.h>
492 #include <errno.h>
493 #include <stdlib.h>
494 #include "libcsv/csv.h"
495
496 struct counts {
497 long unsigned fields;
498 long unsigned rows;
499 };
500
501 void cb1 (void *s, size_t len, void *data) {
502 ((struct counts *)data)->fields++; }
503 void cb2 (int c, void *data) {
504 ((struct counts *)data)->rows++; }
505
506 int main (int argc, char *argv[]) {
507 FILE *fp;
508 struct csv_parser p;
509 char buf[1024];
510 size_t bytes_read;
511 struct counts c = {0, 0};
512
513 if (csv_init(&p, 0) != 0) exit(EXIT_FAILURE);
514 fp = fopen(argv[1], "rb");
515 if (!fp) exit(EXIT_FAILURE);
516
517 while ((bytes_read=fread(buf, 1, 1024, fp)) > 0)
518 if (csv_parse(&p, buf, bytes_read, cb1, cb2, &c) != bytes_read) {
519 fprintf(stderr, "Error while parsing file: %s\n",
520 csv_strerror(csv_error(&p)) );
521 exit(EXIT_FAILURE);
522 }
523
524 csv_fini(&p, cb1, cb2, &c);
525
526 fclose(fp);
527 printf("%lu fields, %lu rows\n", c.fields, c.rows);
528
529 csv_free(&p);
530 exit(EXIT_SUCCESS);
531 }
532
533 See the examples directory for several complete example programs.
534
535
537 Written by Robert Gamble.
538
539
541 Please send questions, comments, bugs, etc. to:
542
543 rgamble@users.sourceforge.net
544
545
546
547 9 January 2013 CSV(3)