1/*************************************************
2*           PCRE DEMONSTRATION PROGRAM           *
3*************************************************/
4
5/* This is a demonstration program to illustrate the most straightforward ways
6of calling the PCRE regular expression library from a C program. See the
7pcresample documentation for a short discussion ("man pcresample" if you have
8the PCRE man pages installed).
9
10In Unix‐like environments, if PCRE is installed in your standard system
11libraries, you should be able to compile this program using this command:
12
13gcc ‐Wall pcredemo.c ‐lpcre ‐o pcredemo
14
15If PCRE is not installed in a standard place, it is likely to be installed with
16support for the pkg‐config mechanism. If you have pkg‐config, you can compile
17this program using this command:
18
19gcc ‐Wall pcredemo.c ‘pkg‐config ‐‐cflags ‐‐libs libpcre‘ ‐o pcredemo
20
21If you do not have pkg‐config, you may have to use this:
22
23gcc ‐Wall pcredemo.c ‐I/usr/local/include ‐L/usr/local/lib \
24  ‐R/usr/local/lib ‐lpcre ‐o pcredemo
25
26Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
27library files for PCRE are installed on your system. Only some operating
28systems (e.g. Solaris) use the ‐R option.
29
30Building under Windows:
31
32If you want to statically link this program against a non‐dll .a file, you must
33define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and
34pcre_free() exported functions will be declared __declspec(dllimport), with
35unwanted results. So in this environment, uncomment the following line. */
36
37/* #define PCRE_STATIC */
38
39#include <stdio.h>
40#include <string.h>
41#include <pcre.h>
42
43#define OVECCOUNT 30    /* should be a multiple of 3 */
44
45
46int main(int argc, char **argv)
47{
48pcre *re;
49const char *error;
50char *pattern;
51char *subject;
52unsigned char *name_table;
53unsigned int option_bits;
54int erroffset;
55int find_all;
56int crlf_is_newline;
57int namecount;
58int name_entry_size;
59int ovector[OVECCOUNT];
60int subject_length;
61int rc, i;
62int utf8;
63
64
65/**************************************************************************
66* First, sort out the command line. There is only one possible option at  *
67* the moment, "‐g" to request repeated matching to find all occurrences,  *
68* like Perl’s /g option. We set the variable find_all to a non‐zero value *
69* if the ‐g option is present. Apart from that, there must be exactly two *
70* arguments.                                                              *
71**************************************************************************/
72
73find_all = 0;
74for (i = 1; i < argc; i++)
75  {
76  if (strcmp(argv[i], "‐g") == 0) find_all = 1;
77    else break;
78  }
79
80/* After the options, we require exactly two arguments, which are the pattern,
81and the subject string. */
82
83if (argc ‐ i != 2)
84  {
85  printf("Two arguments required: a regex and a subject string\n");
86  return 1;
87  }
88
89pattern = argv[i];
90subject = argv[i+1];
91subject_length = (int)strlen(subject);
92
93
94/*************************************************************************
95* Now we are going to compile the regular expression pattern, and handle *
96* and errors that are detected.                                          *
97*************************************************************************/
98
99re = pcre_compile(
100  pattern,              /* the pattern */
101  0,                    /* default options */
102  &error,               /* for error message */
103  &erroffset,           /* for error offset */
104  NULL);                /* use default character tables */
105
106/* Compilation failed: print the error message and exit */
107
108if (re == NULL)
109  {
110  printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
111  return 1;
112  }
113
114
115/*************************************************************************
116* If the compilation succeeded, we call PCRE again, in order to do a     *
117* pattern match against the subject string. This does just ONE match. If *
118* further matching is needed, it will be done below.                     *
119*************************************************************************/
120
121rc = pcre_exec(
122  re,                   /* the compiled pattern */
123  NULL,                 /* no extra data ‐ we didn’t study the pattern */
124  subject,              /* the subject string */
125  subject_length,       /* the length of the subject */
126  0,                    /* start at offset 0 in the subject */
127  0,                    /* default options */
128  ovector,              /* output vector for substring information */
129  OVECCOUNT);           /* number of elements in the output vector */
130
131/* Matching failed: handle error cases */
132
133if (rc < 0)
134  {
135  switch(rc)
136    {
137    case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
138    /*
139    Handle other special cases if you like
140    */
141    default: printf("Matching error %d\n", rc); break;
142    }
143  pcre_free(re);     /* Release memory used for the compiled pattern */
144  return 1;
145  }
146
147/* Match succeded */
148
149printf("\nMatch succeeded at offset %d\n", ovector[0]);
150
151
152/*************************************************************************
153* We have found the first match within the subject string. If the output *
154* vector wasn’t big enough, say so. Then output any substrings that were *
155* captured.                                                              *
156*************************************************************************/
157
158/* The output vector wasn’t big enough */
159
160if (rc == 0)
161  {
162  rc = OVECCOUNT/3;
163  printf("ovector only has room for %d captured substrings\n", rc ‐ 1);
164  }
165
166/* Show substrings stored in the output vector by number. Obviously, in a real
167application you might want to do things other than print them. */
168
169for (i = 0; i < rc; i++)
170  {
171  char *substring_start = subject + ovector[2*i];
172  int substring_length = ovector[2*i+1] ‐ ovector[2*i];
173  printf("%2d: %.*s\n", i, substring_length, substring_start);
174  }
175
176
177/**************************************************************************
178* That concludes the basic part of this demonstration program. We have    *
179* compiled a pattern, and performed a single match. The code that follows *
180* shows first how to access named substrings, and then how to code for    *
181* repeated matches on the same subject.                                   *
182**************************************************************************/
183
184/* See if there are any named substrings, and if so, show them by name. First
185we have to extract the count of named parentheses from the pattern. */
186
187(void)pcre_fullinfo(
188  re,                   /* the compiled pattern */
189  NULL,                 /* no extra data ‐ we didn’t study the pattern */
190  PCRE_INFO_NAMECOUNT,  /* number of named substrings */
191  &namecount);          /* where to put the answer */
192
193if (namecount <= 0) printf("No named substrings\n"); else
194  {
195  unsigned char *tabptr;
196  printf("Named substrings\n");
197
198  /* Before we can access the substrings, we must extract the table for
199  translating names to numbers, and the size of each entry in the table. */
200
201  (void)pcre_fullinfo(
202    re,                       /* the compiled pattern */
203    NULL,                     /* no extra data ‐ we didn’t study the pattern */
204    PCRE_INFO_NAMETABLE,      /* address of the table */
205    &name_table);             /* where to put the answer */
206
207  (void)pcre_fullinfo(
208    re,                       /* the compiled pattern */
209    NULL,                     /* no extra data ‐ we didn’t study the pattern */
210    PCRE_INFO_NAMEENTRYSIZE,  /* size of each entry in the table */
211    &name_entry_size);        /* where to put the answer */
212
213  /* Now we can scan the table and, for each entry, print the number, the name,
214  and the substring itself. */
215
216  tabptr = name_table;
217  for (i = 0; i < namecount; i++)
218    {
219    int n = (tabptr[0] << 8) | tabptr[1];
220    printf("(%d) %*s: %.*s\n", n, name_entry_size ‐ 3, tabptr + 2,
221      ovector[2*n+1] ‐ ovector[2*n], subject + ovector[2*n]);
222    tabptr += name_entry_size;
223    }
224  }
225
226
227/*************************************************************************
228* If the "‐g" option was given on the command line, we want to continue  *
229* to search for additional matches in the subject string, in a similar   *
230* way to the /g option in Perl. This turns out to be trickier than you   *
231* might think because of the possibility of matching an empty string.    *
232* What happens is as follows:                                            *
233*                                                                        *
234* If the previous match was NOT for an empty string, we can just start   *
235* the next match at the end of the previous one.                         *
236*                                                                        *
237* If the previous match WAS for an empty string, we can’t do that, as it *
238* would lead to an infinite loop. Instead, a special call of pcre_exec() *
239* is made with the PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED flags set.    *
240* The first of these tells PCRE that an empty string at the start of the *
241* subject is not a valid match; other possibilities must be tried. The   *
242* second flag restricts PCRE to one match attempt at the initial string  *
243* position. If this match succeeds, an alternative to the empty string   *
244* match has been found, and we can print it and proceed round the loop,  *
245* advancing by the length of whatever was found. If this match does not  *
246* succeed, we still stay in the loop, advancing by just one character.   *
247* In UTF‐8 mode, which can be set by (*UTF8) in the pattern, this may be *
248* more than one byte.                                                    *
249*                                                                        *
250* However, there is a complication concerned with newlines. When the     *
251* newline convention is such that CRLF is a valid newline, we must       *
252* advance by two characters rather than one. The newline convention can  *
253* be set in the regex by (*CR), etc.; if not, we must find the default.  *
254*************************************************************************/
255
256if (!find_all)     /* Check for ‐g */
257  {
258  pcre_free(re);   /* Release the memory used for the compiled pattern */
259  return 0;        /* Finish unless ‐g was given */
260  }
261
262/* Before running the loop, check for UTF‐8 and whether CRLF is a valid newline
263sequence. First, find the options with which the regex was compiled; extract
264the UTF‐8 state, and mask off all but the newline options. */
265
266(void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits);
267utf8 = option_bits & PCRE_UTF8;
268option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF|
269               PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF;
270
271/* If no newline options were set, find the default newline convention from the
272build configuration. */
273
274if (option_bits == 0)
275  {
276  int d;
277  (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
278  /* Note that these values are always the ASCII ones, even in
279  EBCDIC environments. CR = 13, NL = 10. */
280  option_bits = (d == 13)? PCRE_NEWLINE_CR :
281          (d == 10)? PCRE_NEWLINE_LF :
282          (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF :
283          (d == ‐2)? PCRE_NEWLINE_ANYCRLF :
284          (d == ‐1)? PCRE_NEWLINE_ANY : 0;
285  }
286
287/* See if CRLF is a valid newline sequence. */
288
289crlf_is_newline =
290     option_bits == PCRE_NEWLINE_ANY ||
291     option_bits == PCRE_NEWLINE_CRLF ||
292     option_bits == PCRE_NEWLINE_ANYCRLF;
293
294/* Loop for second and subsequent matches */
295
296for (;;)
297  {
298  int options = 0;                 /* Normally no options */
299  int start_offset = ovector[1];   /* Start at end of previous match */
300
301  /* If the previous match was for an empty string, we are finished if we are
302  at the end of the subject. Otherwise, arrange to run another match at the
303  same point to see if a non‐empty match can be found. */
304
305  if (ovector[0] == ovector[1])
306    {
307    if (ovector[0] == subject_length) break;
308    options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
309    }
310
311  /* Run the next matching operation */
312
313  rc = pcre_exec(
314    re,                   /* the compiled pattern */
315    NULL,                 /* no extra data ‐ we didn’t study the pattern */
316    subject,              /* the subject string */
317    subject_length,       /* the length of the subject */
318    start_offset,         /* starting offset in the subject */
319    options,              /* options */
320    ovector,              /* output vector for substring information */
321    OVECCOUNT);           /* number of elements in the output vector */
322
323  /* This time, a result of NOMATCH isn’t an error. If the value in "options"
324  is zero, it just means we have found all possible matches, so the loop ends.
325  Otherwise, it means we have failed to find a non‐empty‐string match at a
326  point where there was a previous empty‐string match. In this case, we do what
327  Perl does: advance the matching position by one character, and continue. We
328  do this by setting the "end of previous match" offset, because that is picked
329  up at the top of the loop as the point at which to start again.
330
331  There are two complications: (a) When CRLF is a valid newline sequence, and
332  the current position is just before it, advance by an extra byte. (b)
333  Otherwise we must ensure that we skip an entire UTF‐8 character if we are in
334  UTF‐8 mode. */
335
336  if (rc == PCRE_ERROR_NOMATCH)
337    {
338    if (options == 0) break;                    /* All matches found */
339    ovector[1] = start_offset + 1;              /* Advance one byte */
340    if (crlf_is_newline &&                      /* If CRLF is newline & */
341        start_offset < subject_length ‐ 1 &&    /* we are at CRLF, */
342        subject[start_offset] == ’\r’ &&
343        subject[start_offset + 1] == ’\n’)
344      ovector[1] += 1;                          /* Advance by one more. */
345    else if (utf8)                              /* Otherwise, ensure we */
346      {                                         /* advance a whole UTF‐8 */
347      while (ovector[1] < subject_length)       /* character. */
348        {
349        if ((subject[ovector[1]] & 0xc0) != 0x80) break;
350        ovector[1] += 1;
351        }
352      }
353    continue;    /* Go round the loop again */
354    }
355
356  /* Other matching errors are not recoverable. */
357
358  if (rc < 0)
359    {
360    printf("Matching error %d\n", rc);
361    pcre_free(re);    /* Release memory used for the compiled pattern */
362    return 1;
363    }
364
365  /* Match succeded */
366
367  printf("\nMatch succeeded again at offset %d\n", ovector[0]);
368
369  /* The match succeeded, but the output vector wasn’t big enough. */
370
371  if (rc == 0)
372    {
373    rc = OVECCOUNT/3;
374    printf("ovector only has room for %d captured substrings\n", rc ‐ 1);
375    }
376
377  /* As before, show substrings stored in the output vector by number, and then
378  also any named substrings. */
379
380  for (i = 0; i < rc; i++)
381    {
382    char *substring_start = subject + ovector[2*i];
383    int substring_length = ovector[2*i+1] ‐ ovector[2*i];
384    printf("%2d: %.*s\n", i, substring_length, substring_start);
385    }
386
387  if (namecount <= 0) printf("No named substrings\n"); else
388    {
389    unsigned char *tabptr = name_table;
390    printf("Named substrings\n");
391    for (i = 0; i < namecount; i++)
392      {
393      int n = (tabptr[0] << 8) | tabptr[1];
394      printf("(%d) %*s: %.*s\n", n, name_entry_size ‐ 3, tabptr + 2,
395        ovector[2*n+1] ‐ ovector[2*n], subject + ovector[2*n]);
396      tabptr += name_entry_size;
397      }
398    }
399  }      /* End of loop to find second and subsequent matches */
400
401printf("\n");
402pcre_free(re);       /* Release memory used for the compiled pattern */
403return 0;
404}
405
406/* End of pcredemo.c */
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
Impressum