1PCRE2API(3) Library Functions Manual PCRE2API(3)
2
3
4
6 PCRE2 - Perl-compatible regular expressions (revised API)
7
8 #include <pcre2.h>
9
10 PCRE2 is a new API for PCRE. This document contains a description of
11 all its functions. See the pcre2 document for an overview of all the
12 PCRE2 documentation.
13
15
16 pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
17 uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
18 pcre2_compile_context *ccontext);
19
20 void pcre2_code_free(pcre2_code *code);
21
22 pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
23 pcre2_general_context *gcontext);
24
25 pcre2_match_data *pcre2_match_data_create_from_pattern(
26 const pcre2_code *code, pcre2_general_context *gcontext);
27
28 int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
29 PCRE2_SIZE length, PCRE2_SIZE startoffset,
30 uint32_t options, pcre2_match_data *match_data,
31 pcre2_match_context *mcontext);
32
33 int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
34 PCRE2_SIZE length, PCRE2_SIZE startoffset,
35 uint32_t options, pcre2_match_data *match_data,
36 pcre2_match_context *mcontext,
37 int *workspace, PCRE2_SIZE wscount);
38
39 void pcre2_match_data_free(pcre2_match_data *match_data);
40
42
43 PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
44
45 uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
46
47 PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
48
49 PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
50
52
53 pcre2_general_context *pcre2_general_context_create(
54 void *(*private_malloc)(PCRE2_SIZE, void *),
55 void (*private_free)(void *, void *), void *memory_data);
56
57 pcre2_general_context *pcre2_general_context_copy(
58 pcre2_general_context *gcontext);
59
60 void pcre2_general_context_free(pcre2_general_context *gcontext);
61
63
64 pcre2_compile_context *pcre2_compile_context_create(
65 pcre2_general_context *gcontext);
66
67 pcre2_compile_context *pcre2_compile_context_copy(
68 pcre2_compile_context *ccontext);
69
70 void pcre2_compile_context_free(pcre2_compile_context *ccontext);
71
72 int pcre2_set_bsr(pcre2_compile_context *ccontext,
73 uint32_t value);
74
75 int pcre2_set_character_tables(pcre2_compile_context *ccontext,
76 const unsigned char *tables);
77
78 int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
79 PCRE2_SIZE value);
80
81 int pcre2_set_newline(pcre2_compile_context *ccontext,
82 uint32_t value);
83
84 int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
85 uint32_t value);
86
87 int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
88 int (*guard_function)(uint32_t, void *), void *user_data);
89
91
92 pcre2_match_context *pcre2_match_context_create(
93 pcre2_general_context *gcontext);
94
95 pcre2_match_context *pcre2_match_context_copy(
96 pcre2_match_context *mcontext);
97
98 void pcre2_match_context_free(pcre2_match_context *mcontext);
99
100 int pcre2_set_callout(pcre2_match_context *mcontext,
101 int (*callout_function)(pcre2_callout_block *, void *),
102 void *callout_data);
103
104 int pcre2_set_match_limit(pcre2_match_context *mcontext,
105 uint32_t value);
106
107 int pcre2_set_offset_limit(pcre2_match_context *mcontext,
108 PCRE2_SIZE value);
109
110 int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
111 uint32_t value);
112
113 int pcre2_set_recursion_memory_management(
114 pcre2_match_context *mcontext,
115 void *(*private_malloc)(PCRE2_SIZE, void *),
116 void (*private_free)(void *, void *), void *memory_data);
117
119
120 int pcre2_substring_copy_byname(pcre2_match_data *match_data,
121 PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
122
123 int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
124 uint32_t number, PCRE2_UCHAR *buffer,
125 PCRE2_SIZE *bufflen);
126
127 void pcre2_substring_free(PCRE2_UCHAR *buffer);
128
129 int pcre2_substring_get_byname(pcre2_match_data *match_data,
130 PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
131
132 int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
133 uint32_t number, PCRE2_UCHAR **bufferptr,
134 PCRE2_SIZE *bufflen);
135
136 int pcre2_substring_length_byname(pcre2_match_data *match_data,
137 PCRE2_SPTR name, PCRE2_SIZE *length);
138
139 int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
140 uint32_t number, PCRE2_SIZE *length);
141
142 int pcre2_substring_nametable_scan(const pcre2_code *code,
143 PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
144
145 int pcre2_substring_number_from_name(const pcre2_code *code,
146 PCRE2_SPTR name);
147
148 void pcre2_substring_list_free(PCRE2_SPTR *list);
149
150 int pcre2_substring_list_get(pcre2_match_data *match_data,
151 PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
152
154
155 int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
156 PCRE2_SIZE length, PCRE2_SIZE startoffset,
157 uint32_t options, pcre2_match_data *match_data,
158 pcre2_match_context *mcontext, PCRE2_SPTR replacement,
159 PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
160 PCRE2_SIZE *outlengthptr);
161
163
164 int pcre2_jit_compile(pcre2_code *code, uint32_t options);
165
166 int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
167 PCRE2_SIZE length, PCRE2_SIZE startoffset,
168 uint32_t options, pcre2_match_data *match_data,
169 pcre2_match_context *mcontext);
170
171 void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
172
173 pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE startsize,
174 PCRE2_SIZE maxsize, pcre2_general_context *gcontext);
175
176 void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
177 pcre2_jit_callback callback_function, void *callback_data);
178
179 void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
180
182
183 int32_t pcre2_serialize_decode(pcre2_code **codes,
184 int32_t number_of_codes, const uint8_t *bytes,
185 pcre2_general_context *gcontext);
186
187 int32_t pcre2_serialize_encode(const pcre2_code **codes,
188 int32_t number_of_codes, uint8_t **serialized_bytes,
189 PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
190
191 void pcre2_serialize_free(uint8_t *bytes);
192
193 int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes);
194
196
197 pcre2_code *pcre2_code_copy(const pcre2_code *code);
198
199 pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
200
201 int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
202 PCRE2_SIZE bufflen);
203
204 const unsigned char *pcre2_maketables(pcre2_general_context *gcontext);
205
206 int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);
207
208 int pcre2_callout_enumerate(const pcre2_code *code,
209 int (*callback)(pcre2_callout_enumerate_block *, void *),
210 void *user_data);
211
212 int pcre2_config(uint32_t what, void *where);
213
215
216 There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit
217 code units, respectively. However, there is just one header file,
218 pcre2.h. This contains the function prototypes and other definitions
219 for all three libraries. One, two, or all three can be installed simul‐
220 taneously. On Unix-like systems the libraries are called libpcre2-8,
221 libpcre2-16, and libpcre2-32, and they can also co-exist with the orig‐
222 inal PCRE libraries.
223
224 Character strings are passed to and from a PCRE2 library as a sequence
225 of unsigned integers in code units of the appropriate width. Every
226 PCRE2 function comes in three different forms, one for each library,
227 for example:
228
229 pcre2_compile_8()
230 pcre2_compile_16()
231 pcre2_compile_32()
232
233 There are also three different sets of data types:
234
235 PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32
236 PCRE2_SPTR8, PCRE2_SPTR16, PCRE2_SPTR32
237
238 The UCHAR types define unsigned code units of the appropriate widths.
239 For example, PCRE2_UCHAR16 is usually defined as `uint16_t'. The SPTR
240 types are constant pointers to the equivalent UCHAR types, that is,
241 they are pointers to vectors of unsigned code units.
242
243 Many applications use only one code unit width. For their convenience,
244 macros are defined whose names are the generic forms such as pcre2_com‐
245 pile() and PCRE2_SPTR. These macros use the value of the macro
246 PCRE2_CODE_UNIT_WIDTH to generate the appropriate width-specific func‐
247 tion and macro names. PCRE2_CODE_UNIT_WIDTH is not defined by default.
248 An application must define it to be 8, 16, or 32 before including
249 pcre2.h in order to make use of the generic names.
250
251 Applications that use more than one code unit width can be linked with
252 more than one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to
253 be 0 before including pcre2.h, and then use the real function names.
254 Any code that is to be included in an environment where the value of
255 PCRE2_CODE_UNIT_WIDTH is unknown should also use the real function
256 names. (Unfortunately, it is not possible in C code to save and restore
257 the value of a macro.)
258
259 If PCRE2_CODE_UNIT_WIDTH is not defined before including pcre2.h, a
260 compiler error occurs.
261
262 When using multiple libraries in an application, you must take care
263 when processing any particular pattern to use only functions from a
264 single library. For example, if you want to run a match using a pat‐
265 tern that was compiled with pcre2_compile_16(), you must do so with
266 pcre2_match_16(), not pcre2_match_8().
267
268 In the function summaries above, and in the rest of this document and
269 other PCRE2 documents, functions and data types are described using
270 their generic names, without the 8, 16, or 32 suffix.
271
273
274 PCRE2 has its own native API, which is described in this document.
275 There are also some wrapper functions for the 8-bit library that corre‐
276 spond to the POSIX regular expression API, but they do not give access
277 to all the functionality. They are described in the pcre2posix documen‐
278 tation. Both these APIs define a set of C function calls.
279
280 The native API C data types, function prototypes, option values, and
281 error codes are defined in the header file pcre2.h, which contains def‐
282 initions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release
283 numbers for the library. Applications can use these to include support
284 for different releases of PCRE2.
285
286 In a Windows environment, if you want to statically link an application
287 program against a non-dll PCRE2 library, you must define PCRE2_STATIC
288 before including pcre2.h.
289
290 The functions pcre2_compile() and pcre2_match() are used for compiling
291 and matching regular expressions in a Perl-compatible manner. A sample
292 program that demonstrates the simplest way of using them is provided in
293 the file called pcre2demo.c in the PCRE2 source distribution. A listing
294 of this program is given in the pcre2demo documentation, and the
295 pcre2sample documentation describes how to compile and run it.
296
297 Just-in-time compiler support is an optional feature of PCRE2 that can
298 be built in appropriate hardware environments. It greatly speeds up the
299 matching performance of many patterns. Programs can request that it be
300 used if available, by calling pcre2_jit_compile() after a pattern has
301 been successfully compiled by pcre2_compile(). This does nothing if JIT
302 support is not available.
303
304 More complicated programs might need to make use of the specialist
305 functions pcre2_jit_stack_create(), pcre2_jit_stack_free(), and
306 pcre2_jit_stack_assign() in order to control the JIT code's memory
307 usage.
308
309 JIT matching is automatically used by pcre2_match() if it is available,
310 unless the PCRE2_NO_JIT option is set. There is also a direct interface
311 for JIT matching, which gives improved performance. The JIT-specific
312 functions are discussed in the pcre2jit documentation.
313
314 A second matching function, pcre2_dfa_match(), which is not Perl-com‐
315 patible, is also provided. This uses a different algorithm for the
316 matching. The alternative algorithm finds all possible matches (at a
317 given point in the subject), and scans the subject just once (unless
318 there are lookbehind assertions). However, this algorithm does not
319 return captured substrings. A description of the two matching algo‐
320 rithms and their advantages and disadvantages is given in the
321 pcre2matching documentation. There is no JIT support for
322 pcre2_dfa_match().
323
324 In addition to the main compiling and matching functions, there are
325 convenience functions for extracting captured substrings from a subject
326 string that has been matched by pcre2_match(). They are:
327
328 pcre2_substring_copy_byname()
329 pcre2_substring_copy_bynumber()
330 pcre2_substring_get_byname()
331 pcre2_substring_get_bynumber()
332 pcre2_substring_list_get()
333 pcre2_substring_length_byname()
334 pcre2_substring_length_bynumber()
335 pcre2_substring_nametable_scan()
336 pcre2_substring_number_from_name()
337
338 pcre2_substring_free() and pcre2_substring_list_free() are also pro‐
339 vided, to free the memory used for extracted strings.
340
341 The function pcre2_substitute() can be called to match a pattern and
342 return a copy of the subject string with substitutions for parts that
343 were matched.
344
345 Functions whose names begin with pcre2_serialize_ are used for saving
346 compiled patterns on disc or elsewhere, and reloading them later.
347
348 Finally, there are functions for finding out information about a com‐
349 piled pattern (pcre2_pattern_info()) and about the configuration with
350 which PCRE2 was built (pcre2_config()).
351
352 Functions with names ending with _free() are used for freeing memory
353 blocks of various sorts. In all cases, if one of these functions is
354 called with a NULL argument, it does nothing.
355
357
358 The PCRE2 API uses string lengths and offsets into strings of code
359 units in several places. These values are always of type PCRE2_SIZE,
360 which is an unsigned integer type, currently always defined as size_t.
361 The largest value that can be stored in such a type (that is
362 ~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated
363 strings and unset offsets. Therefore, the longest string that can be
364 handled is one less than this maximum.
365
367
368 PCRE2 supports five different conventions for indicating line breaks in
369 strings: a single CR (carriage return) character, a single LF (line‐
370 feed) character, the two-character sequence CRLF, any of the three pre‐
371 ceding, or any Unicode newline sequence. The Unicode newline sequences
372 are the three just mentioned, plus the single characters VT (vertical
373 tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line
374 separator, U+2028), and PS (paragraph separator, U+2029).
375
376 Each of the first three conventions is used by at least one operating
377 system as its standard newline sequence. When PCRE2 is built, a default
378 can be specified. The default default is LF, which is the Unix stan‐
379 dard. However, the newline convention can be changed by an application
380 when calling pcre2_compile(), or it can be specified by special text at
381 the start of the pattern itself; this overrides any other settings. See
382 the pcre2pattern page for details of the special character sequences.
383
384 In the PCRE2 documentation the word "newline" is used to mean "the
385 character or pair of characters that indicate a line break". The choice
386 of newline convention affects the handling of the dot, circumflex, and
387 dollar metacharacters, the handling of #-comments in /x mode, and, when
388 CRLF is a recognized line ending sequence, the match position advance‐
389 ment for a non-anchored pattern. There is more detail about this in the
390 section on pcre2_match() options below.
391
392 The choice of newline convention does not affect the interpretation of
393 the \n or \r escape sequences, nor does it affect what \R matches; this
394 has its own separate convention.
395
397
398 In a multithreaded application it is important to keep thread-specific
399 data separate from data that can be shared between threads. The PCRE2
400 library code itself is thread-safe: it contains no static or global
401 variables. The API is designed to be fairly simple for non-threaded
402 applications while at the same time ensuring that multithreaded appli‐
403 cations can use it.
404
405 There are several different blocks of data that are used to pass infor‐
406 mation between the application and the PCRE2 libraries.
407
408 The compiled pattern
409
410 A pointer to the compiled form of a pattern is returned to the user
411 when pcre2_compile() is successful. The data in the compiled pattern is
412 fixed, and does not change when the pattern is matched. Therefore, it
413 is thread-safe, that is, the same compiled pattern can be used by more
414 than one thread simultaneously. For example, an application can compile
415 all its patterns at the start, before forking off multiple threads that
416 use them. However, if the just-in-time optimization feature is being
417 used, it needs separate memory stack areas for each thread. See the
418 pcre2jit documentation for more details.
419
420 In a more complicated situation, where patterns are compiled only when
421 they are first needed, but are still shared between threads, pointers
422 to compiled patterns must be protected from simultaneous writing by
423 multiple threads, at least until a pattern has been compiled. The logic
424 can be something like this:
425
426 Get a read-only (shared) lock (mutex) for pointer
427 if (pointer == NULL)
428 {
429 Get a write (unique) lock for pointer
430 pointer = pcre2_compile(...
431 }
432 Release the lock
433 Use pointer in pcre2_match()
434
435 Of course, testing for compilation errors should also be included in
436 the code.
437
438 If JIT is being used, but the JIT compilation is not being done immedi‐
439 ately (perhaps waiting to see if the pattern is used often enough),
440 similar logic is required. JIT compilation updates a pointer within the
441 compiled code block, so a thread must gain unique write access to the
442 pointer before calling pcre2_jit_compile(). Alternatively,
443 pcre2_code_copy() or pcre2_code_copy_with_tables() can be used to
444 obtain a private copy of the compiled code.
445
446 Context blocks
447
448 The next main section below introduces the idea of "contexts" in which
449 PCRE2 functions are called. A context is nothing more than a collection
450 of parameters that control the way PCRE2 operates. Grouping a number of
451 parameters together in a context is a convenient way of passing them to
452 a PCRE2 function without using lots of arguments. The parameters that
453 are stored in contexts are in some sense "advanced features" of the
454 API. Many straightforward applications will not need to use contexts.
455
456 In a multithreaded application, if the parameters in a context are val‐
457 ues that are never changed, the same context can be used by all the
458 threads. However, if any thread needs to change any value in a context,
459 it must make its own thread-specific copy.
460
461 Match blocks
462
463 The matching functions need a block of memory for working space and for
464 storing the results of a match. This includes details of what was
465 matched, as well as additional information such as the name of a
466 (*MARK) setting. Each thread must provide its own copy of this memory.
467
469
470 Some PCRE2 functions have a lot of parameters, many of which are used
471 only by specialist applications, for example, those that use custom
472 memory management or non-standard character tables. To keep function
473 argument lists at a reasonable size, and at the same time to keep the
474 API extensible, "uncommon" parameters are passed to certain functions
475 in a context instead of directly. A context is just a block of memory
476 that holds the parameter values. Applications that do not need to
477 adjust any of the context parameters can pass NULL when a context
478 pointer is required.
479
480 There are three different types of context: a general context that is
481 relevant for several PCRE2 operations, a compile-time context, and a
482 match-time context.
483
484 The general context
485
486 At present, this context just contains pointers to (and data for)
487 external memory management functions that are called from several
488 places in the PCRE2 library. The context is named `general' rather than
489 specifically `memory' because in future other fields may be added. If
490 you do not want to supply your own custom memory management functions,
491 you do not need to bother with a general context. A general context is
492 created by:
493
494 pcre2_general_context *pcre2_general_context_create(
495 void *(*private_malloc)(PCRE2_SIZE, void *),
496 void (*private_free)(void *, void *), void *memory_data);
497
498 The two function pointers specify custom memory management functions,
499 whose prototypes are:
500
501 void *private_malloc(PCRE2_SIZE, void *);
502 void private_free(void *, void *);
503
504 Whenever code in PCRE2 calls these functions, the final argument is the
505 value of memory_data. Either of the first two arguments of the creation
506 function may be NULL, in which case the system memory management func‐
507 tions malloc() and free() are used. (This is not currently useful, as
508 there are no other fields in a general context, but in future there
509 might be.) The private_malloc() function is used (if supplied) to
510 obtain memory for storing the context, and all three values are saved
511 as part of the context.
512
513 Whenever PCRE2 creates a data block of any kind, the block contains a
514 pointer to the free() function that matches the malloc() function that
515 was used. When the time comes to free the block, this function is
516 called.
517
518 A general context can be copied by calling:
519
520 pcre2_general_context *pcre2_general_context_copy(
521 pcre2_general_context *gcontext);
522
523 The memory used for a general context should be freed by calling:
524
525 void pcre2_general_context_free(pcre2_general_context *gcontext);
526
527
528 The compile context
529
530 A compile context is required if you want to change the default values
531 of any of the following compile-time parameters:
532
533 What \R matches (Unicode newlines or CR, LF, CRLF only)
534 PCRE2's character tables
535 The newline character sequence
536 The compile time nested parentheses limit
537 The maximum length of the pattern string
538 An external function for stack checking
539
540 A compile context is also required if you are using custom memory man‐
541 agement. If none of these apply, just pass NULL as the context argu‐
542 ment of pcre2_compile().
543
544 A compile context is created, copied, and freed by the following func‐
545 tions:
546
547 pcre2_compile_context *pcre2_compile_context_create(
548 pcre2_general_context *gcontext);
549
550 pcre2_compile_context *pcre2_compile_context_copy(
551 pcre2_compile_context *ccontext);
552
553 void pcre2_compile_context_free(pcre2_compile_context *ccontext);
554
555 A compile context is created with default values for its parameters.
556 These can be changed by calling the following functions, which return 0
557 on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
558
559 int pcre2_set_bsr(pcre2_compile_context *ccontext,
560 uint32_t value);
561
562 The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only
563 CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any
564 Unicode line ending sequence. The value is used by the JIT compiler and
565 by the two interpreted matching functions, pcre2_match() and
566 pcre2_dfa_match().
567
568 int pcre2_set_character_tables(pcre2_compile_context *ccontext,
569 const unsigned char *tables);
570
571 The value must be the result of a call to pcre2_maketables(), whose
572 only argument is a general context. This function builds a set of char‐
573 acter tables in the current locale.
574
575 int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
576 PCRE2_SIZE value);
577
578 This sets a maximum length, in code units, for the pattern string that
579 is to be compiled. If the pattern is longer, an error is generated.
580 This facility is provided so that applications that accept patterns
581 from external sources can limit their size. The default is the largest
582 number that a PCRE2_SIZE variable can hold, which is effectively unlim‐
583 ited.
584
585 int pcre2_set_newline(pcre2_compile_context *ccontext,
586 uint32_t value);
587
588 This specifies which characters or character sequences are to be recog‐
589 nized as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage
590 return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the
591 two-character sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any
592 of the above), or PCRE2_NEWLINE_ANY (any Unicode newline sequence).
593
594 When a pattern is compiled with the PCRE2_EXTENDED option, the value of
595 this parameter affects the recognition of white space and the end of
596 internal comments starting with #. The value is saved with the compiled
597 pattern for subsequent use by the JIT compiler and by the two inter‐
598 preted matching functions, pcre2_match() and pcre2_dfa_match().
599
600 int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
601 uint32_t value);
602
603 This parameter adjusts the limit, set when PCRE2 is built (default 250),
604 on the depth of parenthesis nesting in a pattern. This limit stops
605 rogue patterns using up too much system stack when being compiled. The
606 limit applies to parentheses of all kinds, not just capturing parenthe‐
607 ses.
608
609 int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
610 int (*guard_function)(uint32_t, void *), void *user_data);
611
612 There is at least one application that runs PCRE2 in threads with very
613 limited system stack, where running out of stack is to be avoided at
614 all costs. The parenthesis limit above cannot take account of how much
615 stack is actually available. For a finer control, you can supply a
616 function that is called whenever pcre2_compile() starts to compile a
617 parenthesized part of a pattern. This function can check the actual
618 stack size (or anything else that it wants to, of course).
619
620 The first argument to the guard function gives the current depth of
621 nesting, and the second is user data that is set up by the last argu‐
622 ment of pcre2_set_compile_recursion_guard(). The guard function
623 should return zero if all is well, or non-zero to force an error.
624
625 The match context
626
627 A match context is required if you want to change the default values of
628 any of the following match-time parameters:
629
630 A callout function
631 The offset limit for matching an unanchored pattern
632 The limit for calling match() (see below)
633 The limit for calling match() recursively
634
635 A match context is also required if you are using custom memory manage‐
636 ment. If none of these apply, just pass NULL as the context argument
637 of pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match().
638
639 A match context is created, copied, and freed by the following func‐
640 tions:
641
642 pcre2_match_context *pcre2_match_context_create(
643 pcre2_general_context *gcontext);
644
645 pcre2_match_context *pcre2_match_context_copy(
646 pcre2_match_context *mcontext);
647
648 void pcre2_match_context_free(pcre2_match_context *mcontext);
649
650 A match context is created with default values for its parameters.
651 These can be changed by calling the following functions, which return 0
652 on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
653
654 int pcre2_set_callout(pcre2_match_context *mcontext,
655 int (*callout_function)(pcre2_callout_block *, void *),
656 void *callout_data);
657
658 This sets up a "callout" function, which PCRE2 will call at specified
659 points during a matching operation. Details are given in the pcre2call‐
660 out documentation.
661
662 int pcre2_set_offset_limit(pcre2_match_context *mcontext,
663 PCRE2_SIZE value);
664
665 The offset_limit parameter limits how far an unanchored search can
666 advance in the subject string. The default value is PCRE2_UNSET. The
667 pcre2_match() and pcre2_dfa_match() functions return
668 PCRE2_ERROR_NOMATCH if a match with a starting point before or at the
669 given offset is not found. For example, if the pattern /abc/ is matched
670 against "123abc" with an offset limit less than 3, the result is
671 PCRE2_ERROR_NOMATCH. A match can never be found if the startoffset
672 argument of pcre2_match() or pcre2_dfa_match() is greater than the off‐
673 set limit.
674
675 When using this facility, you must set PCRE2_USE_OFFSET_LIMIT when
676 calling pcre2_compile() so that when JIT is in use, different code can
677 be compiled. If a match is started with a non-default offset limit when
678 PCRE2_USE_OFFSET_LIMIT is not set, an error is generated.
679
680 The offset limit facility can be used to track progress when searching
681 large subject strings. See also the PCRE2_FIRSTLINE option, which
682 requires a match to start within the first line of the subject. If this
683 is set with an offset limit, a match must occur in the first line and
684 also within the offset limit. In other words, whichever limit comes
685 first is used.
686
687 int pcre2_set_match_limit(pcre2_match_context *mcontext,
688 uint32_t value);
689
690 The match_limit parameter provides a means of preventing PCRE2 from
691 using up too many resources when processing patterns that are not going
692 to match, but which have a very large number of possibilities in their
693 search trees. The classic example is a pattern that uses nested unlim‐
694 ited repeats.
695
696 Internally, pcre2_match() uses a function called match(), which it
697 calls repeatedly (sometimes recursively). The limit set by match_limit
698 is imposed on the number of times this function is called during a
699 match, which has the effect of limiting the amount of backtracking that
700 can take place. For patterns that are not anchored, the count restarts
701 from zero for each position in the subject string. This limit is not
702 relevant to pcre2_dfa_match(), which ignores it.
703
704 When pcre2_match() is called with a pattern that was successfully pro‐
705 cessed by pcre2_jit_compile(), the way in which matching is executed is
706 entirely different. However, there is still the possibility of runaway
707 matching that goes on for a very long time, and so the match_limit
708 value is also used in this case (but in a different way) to limit how
709 long the matching can continue.
710
711 The default value for the limit can be set when PCRE2 is built; the
712 default default is 10 million, which handles all but the most extreme
713 cases. If the limit is exceeded, pcre2_match() returns
714 PCRE2_ERROR_MATCHLIMIT. A value for the match limit may also be sup‐
715 plied by an item at the start of a pattern of the form
716
717 (*LIMIT_MATCH=ddd)
718
719 where ddd is a decimal number. However, such a setting is ignored
720 unless ddd is less than the limit set by the caller of pcre2_match()
721 or, if no such limit is set, less than the default.
722
723 int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
724 uint32_t value);
725
726 The recursion_limit parameter is similar to match_limit, but instead of
727 limiting the total number of times that match() is called, it limits
728 the depth of recursion. The recursion depth is a smaller number than
729 the total number of calls, because not all calls to match() are recur‐
730 sive. This limit is of use only if it is set smaller than match_limit.
731
732 Limiting the recursion depth limits the amount of system stack that can
733 be used, or, when PCRE2 has been compiled to use memory on the heap
734 instead of the stack, the amount of heap memory that can be used. This
735 limit is not relevant, and is ignored, when matching is done using JIT
736 compiled code. However, it is supported by pcre2_dfa_match(), which
737 uses recursive function calls less frequently than pcre2_match(), but
738 which can be caused to use a lot of stack by a recursive pattern such
739 as /(.)(?1)/ matched to a very long string.
740
741 The default value for recursion_limit can be set when PCRE2 is built;
742 the default for that build setting is the same value as the default for match_limit.
743 If the limit is exceeded, pcre2_match() and pcre2_dfa_match() return
744 PCRE2_ERROR_RECURSIONLIMIT. A value for the recursion limit may also be
745 supplied by an item at the start of a pattern of the form
746
747 (*LIMIT_RECURSION=ddd)
748
749 where ddd is a decimal number. However, such a setting is ignored
750 unless ddd is less than the limit set by the caller of pcre2_match() or
751 pcre2_dfa_match() or, if no such limit is set, less than the default.
752
753 int pcre2_set_recursion_memory_management(
754 pcre2_match_context *mcontext,
755 void *(*private_malloc)(PCRE2_SIZE, void *),
756 void (*private_free)(void *, void *), void *memory_data);
757
758 This function sets up two additional custom memory management functions
759 for use by pcre2_match() when PCRE2 is compiled to use the heap for
760 remembering backtracking data, instead of recursive function calls that
761 use the system stack. There is a discussion about PCRE2's stack usage
762 in the pcre2stack documentation. See the pcre2build documentation for
763 details of how to build PCRE2.
764
765 Using the heap for recursion is a non-standard way of building PCRE2,
766 for use in environments that have limited stacks. Because of the
767 greater use of memory management, pcre2_match() runs more slowly. Func‐
768 tions that are different to the general custom memory functions are
769 provided so that special-purpose external code can be used for this
770 case, because the memory blocks are all the same size. The blocks are
771 retained by pcre2_match() until it is about to exit so that they can be
772 re-used when possible during the match. In the absence of these func‐
773 tions, the normal custom memory management functions are used, if sup‐
774 plied, otherwise the system functions.
775
777
778 int pcre2_config(uint32_t what, void *where);
779
780 The function pcre2_config() makes it possible for a PCRE2 client to
781 discover which optional features have been compiled into the PCRE2
782 library. The pcre2build documentation has more details about these
783 optional features.
784
785 The first argument for pcre2_config() specifies which information is
786 required. The second argument is a pointer to memory into which the
787 information is placed. If NULL is passed, the function returns the
788 amount of memory that is needed for the requested information. For
789 calls that return numerical values, the value is in bytes; when
790 requesting these values, where should point to appropriately aligned
791 memory. For calls that return strings, the required length is given in
792 code units, not counting the terminating zero.
793
794 When requesting information, the returned value from pcre2_config() is
795 non-negative on success, or the negative error code PCRE2_ERROR_BADOP‐
796 TION if the value in the first argument is not recognized. The follow‐
797 ing information is available:
798
799 PCRE2_CONFIG_BSR
800
801 The output is a uint32_t integer whose value indicates what character
802 sequences the \R escape sequence matches by default. A value of
803 PCRE2_BSR_UNICODE means that \R matches any Unicode line ending
804 sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR,
805 LF, or CRLF. The default can be overridden when a pattern is compiled.
806
807 PCRE2_CONFIG_JIT
808
809 The output is a uint32_t integer that is set to one if support for
810 just-in-time compiling is available; otherwise it is set to zero.
811
812 PCRE2_CONFIG_JITTARGET
813
814 The where argument should point to a buffer that is at least 48 code
815 units long. (The exact length required can be found by calling
816 pcre2_config() with where set to NULL.) The buffer is filled with a
817 string that contains the name of the architecture for which the JIT
818 compiler is configured, for example "x86 32bit (little endian +
819 unaligned)". If JIT support is not available, PCRE2_ERROR_BADOPTION is
820 returned, otherwise the number of code units used is returned. This is
821 the length of the string, plus one unit for the terminating zero.
822
823 PCRE2_CONFIG_LINKSIZE
824
825 The output is a uint32_t integer that contains the number of bytes used
826 for internal linkage in compiled regular expressions. When PCRE2 is
827 configured, the value can be set to 2, 3, or 4, with the default being
828 2. This is the value that is returned by pcre2_config(). However, when
829 the 16-bit library is compiled, a value of 3 is rounded up to 4, and
830 when the 32-bit library is compiled, internal linkages always use 4
831 bytes, so the configured value is not relevant.
832
833 The default value of 2 for the 8-bit and 16-bit libraries is sufficient
834 for all but the most massive patterns, since it allows the size of the
835 compiled pattern to be up to 64K code units. Larger values allow larger
836 regular expressions to be compiled by those two libraries, but at the
837 expense of slower matching.
838
839 PCRE2_CONFIG_MATCHLIMIT
840
841 The output is a uint32_t integer that gives the default limit for the
842 number of internal matching function calls in a pcre2_match() execu‐
843 tion. Further details are given with pcre2_set_match_limit() above.
844
845 PCRE2_CONFIG_NEWLINE
846
847 The output is a uint32_t integer whose value specifies the default
848 character sequence that is recognized as meaning "newline". The values
849 are:
850
851 PCRE2_NEWLINE_CR Carriage return (CR)
852 PCRE2_NEWLINE_LF Linefeed (LF)
853 PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF)
854 PCRE2_NEWLINE_ANY Any Unicode line ending
855 PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF
856
857 The default should normally correspond to the standard sequence for
858 your operating system.
859
860 PCRE2_CONFIG_PARENSLIMIT
861
862 The output is a uint32_t integer that gives the maximum depth of nest‐
863 ing of parentheses (of any kind) in a pattern. This limit is imposed to
864 cap the amount of system stack used when a pattern is compiled. It is
865 specified when PCRE2 is built; the default is 250. This limit does not
866 take into account the stack that may already be used by the calling
867 application. For finer control over compilation stack usage, see
868 pcre2_set_compile_recursion_guard().
869
870 PCRE2_CONFIG_RECURSIONLIMIT
871
872 The output is a uint32_t integer that gives the default limit for the
873 depth of recursion when calling the internal matching function in a
874 pcre2_match() execution. Further details are given with
875 pcre2_set_recursion_limit() above.
876
877 PCRE2_CONFIG_STACKRECURSE
878
879 The output is a uint32_t integer that is set to one if internal recur‐
880 sion when running pcre2_match() is implemented by recursive function
881 calls that use the system stack to remember their state. This is the
882 usual way that PCRE2 is compiled. The output is zero if PCRE2 was com‐
883 piled to use blocks of data on the heap instead of recursive function
884 calls.
885
886 PCRE2_CONFIG_UNICODE_VERSION
887
888 The where argument should point to a buffer that is at least 24 code
889 units long. (The exact length required can be found by calling
890 pcre2_config() with where set to NULL.) If PCRE2 has been compiled
891 without Unicode support, the buffer is filled with the text "Unicode
892 not supported". Otherwise, the Unicode version string (for example,
893 "8.0.0") is inserted. The number of code units used is returned. This
894 is the length of the string plus one unit for the terminating zero.
895
896 PCRE2_CONFIG_UNICODE
897
898 The output is a uint32_t integer that is set to one if Unicode support
899 is available; otherwise it is set to zero. Unicode support implies UTF
900 support.
901
902 PCRE2_CONFIG_VERSION
903
904 The where argument should point to a buffer that is at least 12 code
905 units long. (The exact length required can be found by calling
906 pcre2_config() with where set to NULL.) The buffer is filled with the
907 PCRE2 version string, zero-terminated. The number of code units used is
908 returned. This is the length of the string plus one unit for the termi‐
909 nating zero.
910
912
913 pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
914 uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
915 pcre2_compile_context *ccontext);
916
917 void pcre2_code_free(pcre2_code *code);
918
919 pcre2_code *pcre2_code_copy(const pcre2_code *code);
920
921 pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
922
923 The pcre2_compile() function compiles a pattern into an internal form.
924 The pattern is defined by a pointer to a string of code units and a
925 length. If the pattern is zero-terminated, the length can be specified
926 as PCRE2_ZERO_TERMINATED. The function returns a pointer to a block of
927 memory that contains the compiled pattern and related data, or NULL if
928 an error occurred.
929
930 If the compile context argument ccontext is NULL, memory for the com‐
931 piled pattern is obtained by calling malloc(). Otherwise, it is
932 obtained from the same memory function that was used for the compile
933 context. The caller must free the memory by calling pcre2_code_free()
934 when it is no longer needed.
935
936 The function pcre2_code_copy() makes a copy of the compiled code in new
937 memory, using the same memory allocator as was used for the original.
938 However, if the code has been processed by the JIT compiler (see
939 below), the JIT information cannot be copied (because it is position-
940 dependent). The new copy can initially be used only for non-JIT match‐
941 ing, though it can be passed to pcre2_jit_compile() if required.
942
943 The pcre2_code_copy() function provides a way for individual threads in
944 a multithreaded application to acquire a private copy of shared com‐
945 piled code. However, it does not make a copy of the character tables
946 used by the compiled pattern; the new pattern code points to the same
947 tables as the original code. (See "Locale Support" below for details
948 of these character tables.) In many applications the same tables are
949 used throughout, so this behaviour is appropriate. Nevertheless, there
950 are occasions when copies of both a compiled pattern and the relevant tables
951 are needed. The pcre2_code_copy_with_tables() function provides this facility.
952 Copies of both the code and the tables are made, with the new code
953 pointing to the new tables. The memory for the new tables is automati‐
954 cally freed when pcre2_code_free() is called for the new copy of the
955 compiled code.
956
957 NOTE: When one of the matching functions is called, pointers to the
958 compiled pattern and the subject string are set in the match data block
959 so that they can be referenced by the substring extraction functions.
960 After running a match, you must not free a compiled pattern (or a sub‐
961 ject string) until after all operations on the match data block have
962 taken place.
963
964 The options argument for pcre2_compile() contains various bit settings
965 that affect the compilation. It should be zero if no options are
966 required. The available options are described below. Some of them (in
967 particular, those that are compatible with Perl, but some others as
968 well) can also be set and unset from within the pattern (see the
969 detailed description in the pcre2pattern documentation).
970
971 For those options that can be different in different parts of the pat‐
972 tern, the contents of the options argument specifies their settings at
973 the start of compilation. The PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK
974 options can be set at the time of matching as well as at compile time.
975
976 Other, less frequently required compile-time parameters (for example,
977 the newline setting) can be provided in a compile context (as described
978 above).
979
980 If errorcode or erroroffset is NULL, pcre2_compile() returns NULL imme‐
981 diately. Otherwise, the variables to which these point are set to an
982 error code and an offset (number of code units) within the pattern,
983 respectively, when pcre2_compile() returns NULL because a compilation
984 error has occurred. The values are not defined when compilation is suc‐
985 cessful and pcre2_compile() returns a non-NULL value.
986
987 The value returned in erroroffset is an indication of where in the pat‐
988 tern the error occurred. It is not necessarily the furthest point in
989 the pattern that was read. For example, after the error "lookbehind
990 assertion is not fixed length", the error offset points to the start of
991 the failing assertion.
992
993 The pcre2_get_error_message() function (see "Obtaining a textual error
994 message" below) provides a textual message for each error code. Compi‐
995 lation errors have positive error codes; UTF formatting error codes are
996 negative. For an invalid UTF-8 or UTF-16 string, the offset is that of
997 the first code unit of the failing character.
998
999 Some errors are not detected until the whole pattern has been scanned;
1000 in these cases, the offset passed back is the length of the pattern.
1001 Note that the offset is in code units, not characters, even in a UTF
1002 mode. It may sometimes point into the middle of a UTF-8 or UTF-16 char‐
1003 acter.
1004
1005 This code fragment shows a typical straightforward call to pcre2_com‐
1006 pile():
1007
1008 pcre2_code *re;
1009 PCRE2_SIZE erroffset;
1010 int errorcode;
1011 re = pcre2_compile(
1012 "^A.*Z", /* the pattern */
1013 PCRE2_ZERO_TERMINATED, /* the pattern is zero-terminated */
1014 0, /* default options */
1015 &errorcode, /* for error code */
1016 &erroffset, /* for error offset */
1017 NULL); /* no compile context */
1018
1019 The following names for option bits are defined in the pcre2.h header
1020 file:
1021
1022 PCRE2_ANCHORED
1023
1024 If this bit is set, the pattern is forced to be "anchored", that is, it
1025 is constrained to match only at the first matching point in the string
1026 that is being searched (the "subject string"). This effect can also be
1027 achieved by appropriate constructs in the pattern itself, which is the
1028 only way to do it in Perl.
1029
1030 PCRE2_ALLOW_EMPTY_CLASS
1031
1032 By default, for compatibility with Perl, a closing square bracket that
1033 immediately follows an opening one is treated as a data character for
1034 the class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the
1035 class, which therefore contains no characters and so can never match.
1036
1037 PCRE2_ALT_BSUX
1038
1039 This option requests alternative handling of three escape sequences,
1040 which makes PCRE2's behaviour more like ECMAScript (aka JavaScript).
1041 When it is set:
1042
1043 (1) \U matches an upper case "U" character; by default \U causes a com‐
1044 pile time error (Perl uses \U to upper case subsequent characters).
1045
1046 (2) \u matches a lower case "u" character unless it is followed by four
1047 hexadecimal digits, in which case the hexadecimal number defines the
1048 code point to match. By default, \u causes a compile time error (Perl
1049 uses it to upper case the following character).
1050
1051 (3) \x matches a lower case "x" character unless it is followed by two
1052 hexadecimal digits, in which case the hexadecimal number defines the
1053 code point to match. By default, as in Perl, a hexadecimal number is
1054 always expected after \x, but it may have zero, one, or two digits (so,
1055 for example, \xz matches a binary zero character followed by z).
1056
1057 PCRE2_ALT_CIRCUMFLEX
1058
1059 In multiline mode (when PCRE2_MULTILINE is set), the circumflex
1060 metacharacter matches at the start of the subject (unless PCRE2_NOTBOL
1061 is set), and also after any internal newline. However, it does not
1062 match after a newline at the end of the subject, for compatibility with
1063 Perl. If you want a multiline circumflex also to match after a termi‐
1064 nating newline, you must set PCRE2_ALT_CIRCUMFLEX.
1065
1066 PCRE2_ALT_VERBNAMES
1067
1068 By default, for compatibility with Perl, the name in any verb sequence
1069 such as (*MARK:NAME) is any sequence of characters that does not
1070 include a closing parenthesis. The name is not processed in any way,
1071 and it is not possible to include a closing parenthesis in the name.
1072 However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash
1073 processing is applied to verb names and only an unescaped closing
1074 parenthesis terminates the name. A closing parenthesis can be included
1075 in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED
1076 option is set, unescaped whitespace in verb names is skipped and #-com‐
1077 ments are recognized, exactly as in the rest of the pattern.
1078
1079 PCRE2_AUTO_CALLOUT
1080
1081 If this bit is set, pcre2_compile() automatically inserts callout
1082 items, all with number 255, before each pattern item, except immedi‐
1083 ately before or after a callout in the pattern. For discussion of the
1084 callout facility, see the pcre2callout documentation.
1085
1086 PCRE2_CASELESS
1087
1088 If this bit is set, letters in the pattern match both upper and lower
1089 case letters in the subject. It is equivalent to Perl's /i option, and
1090 it can be changed within a pattern by a (?i) option setting.
1091
1092 PCRE2_DOLLAR_ENDONLY
1093
1094 If this bit is set, a dollar metacharacter in the pattern matches only
1095 at the end of the subject string. Without this option, a dollar also
1096 matches immediately before a newline at the end of the string (but not
1097 before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored
1098 if PCRE2_MULTILINE is set. There is no equivalent to this option in
1099 Perl, and no way to set it within a pattern.
1100
1101 PCRE2_DOTALL
1102
1103 If this bit is set, a dot metacharacter in the pattern matches any
1104 character, including one that indicates a newline. However, it only
1105 ever matches one character, even if newlines are coded as CRLF. Without
1106 this option, a dot does not match when the current position in the sub‐
1107 ject is at a newline. This option is equivalent to Perl's /s option,
1108 and it can be changed within a pattern by a (?s) option setting. A neg‐
1109 ative class such as [^a] always matches newline characters, independent
1110 of the setting of this option.
1111
1112 PCRE2_DUPNAMES
1113
1114 If this bit is set, names used to identify capturing subpatterns need
1115 not be unique. This can be helpful for certain types of pattern when it
1116 is known that only one instance of the named subpattern can ever be
1117 matched. There are more details of named subpatterns below; see also
1118 the pcre2pattern documentation.
1119
1120 PCRE2_EXTENDED
1121
1122 If this bit is set, most white space characters in the pattern are
1123 totally ignored except when escaped or inside a character class. How‐
1124 ever, white space is not allowed within sequences such as (?> that
1125 introduce various parenthesized subpatterns, nor within numerical quan‐
1126 tifiers such as {1,3}. Ignorable white space is permitted between an
1127 item and a following quantifier and between a quantifier and a follow‐
1128 ing + that indicates possessiveness.
1129
1130 PCRE2_EXTENDED also causes characters between an unescaped # outside a
1131 character class and the next newline, inclusive, to be ignored, which
1132 makes it possible to include comments inside complicated patterns. Note
1133 that the end of this type of comment is a literal newline sequence in
1134 the pattern; escape sequences that happen to represent a newline do not
1135 count. PCRE2_EXTENDED is equivalent to Perl's /x option, and it can be
1136 changed within a pattern by a (?x) option setting.
1137
1138 Which characters are interpreted as newlines can be specified by a set‐
1139 ting in the compile context that is passed to pcre2_compile() or by a
1140 special sequence at the start of the pattern, as described in the sec‐
1141 tion entitled "Newline conventions" in the pcre2pattern documentation.
1142 A default is defined when PCRE2 is built.
1143
1144 PCRE2_FIRSTLINE
1145
1146 If this option is set, an unanchored pattern is required to match
1147 before or at the first newline in the subject string, though the
1148 matched text may continue over the newline. See also PCRE2_USE_OFF‐
1149 SET_LIMIT, which provides a more general limiting facility. If
1150 PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the
1151 first line and also within the offset limit. In other words, whichever
1152 limit comes first is used.
1153
1154 PCRE2_MATCH_UNSET_BACKREF
1155
1156 If this option is set, a back reference to an unset subpattern group
1157 matches an empty string (by default this causes the current matching
1158 alternative to fail). A pattern such as (\1)(a) succeeds when this
1159 option is set (assuming it can find an "a" in the subject), whereas it
1160 fails by default, for Perl compatibility. Setting this option makes
1161 PCRE2 behave more like ECMAScript (aka JavaScript).
1162
1163 PCRE2_MULTILINE
1164
1165 By default, for the purposes of matching "start of line" and "end of
1166 line", PCRE2 treats the subject string as consisting of a single line
1167 of characters, even if it actually contains newlines. The "start of
1168 line" metacharacter (^) matches only at the start of the string, and
1169 the "end of line" metacharacter ($) matches only at the end of the
1170 string, or before a terminating newline (except when PCRE2_DOL‐
1171 LAR_ENDONLY is set). Note, however, that unless PCRE2_DOTALL is set,
1172 the "any character" metacharacter (.) does not match at a newline. This
1173 behaviour (for ^, $, and dot) is the same as Perl.
1174
1175 When PCRE2_MULTILINE is set, the "start of line" and "end of line"
1176 constructs match immediately following or immediately before internal
1177 newlines in the subject string, respectively, as well as at the very
1178 start and end. This is equivalent to Perl's /m option, and it can be
1179 changed within a pattern by a (?m) option setting. Note that the "start
1180 of line" metacharacter does not match after a newline at the end of the
1181 subject, for compatibility with Perl. However, you can change this by
1182 setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a
1183 subject string, or no occurrences of ^ or $ in a pattern, setting
1184 PCRE2_MULTILINE has no effect.
1185
1186 PCRE2_NEVER_BACKSLASH_C
1187
1188 This option locks out the use of \C in the pattern that is being com‐
1189 piled. This escape can cause unpredictable behaviour in UTF-8 or
1190 UTF-16 modes, because it may leave the current matching point in the
1191 middle of a multi-code-unit character. This option may be useful in
1192 applications that process patterns from external sources. Note that
1193 there is also a build-time option that permanently locks out the use of
1194 \C.
1195
1196 PCRE2_NEVER_UCP
1197
1198 This option locks out the use of Unicode properties for handling \B,
1199 \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as
1200 described for the PCRE2_UCP option below. In particular, it prevents
1201 the creator of the pattern from enabling this facility by starting the
1202 pattern with (*UCP). This option may be useful in applications that
1203 process patterns from external sources. The option combination PCRE2_UCP
1204 and PCRE2_NEVER_UCP causes an error.
1205
1206 PCRE2_NEVER_UTF
1207
1208 This option locks out interpretation of the pattern as UTF-8, UTF-16,
1209 or UTF-32, depending on which library is in use. In particular, it pre‐
1210 vents the creator of the pattern from switching to UTF interpretation
1211 by starting the pattern with (*UTF). This option may be useful in
1212 applications that process patterns from external sources. The combina‐
1213 tion of PCRE2_UTF and PCRE2_NEVER_UTF causes an error.
1214
1215 PCRE2_NO_AUTO_CAPTURE
1216
1217 If this option is set, it disables the use of numbered capturing paren‐
1218 theses in the pattern. Any opening parenthesis that is not followed by
1219 ? behaves as if it were followed by ?: but named parentheses can still
1220 be used for capturing (and they acquire numbers in the usual way).
1221 There is no equivalent of this option in Perl. Note that, if this
1222 option is set, references to capturing groups (back references or
1223 recursion/subroutine calls) may only refer to named groups, though the
1224 reference can be by name or by number.
1225
1226 PCRE2_NO_AUTO_POSSESS
1227
1228 If this option is set, it disables "auto-possessification", which is an
1229 optimization that, for example, turns a+b into a++b in order to avoid
1230 backtracks into a+ that can never be successful. However, if callouts
1231 are in use, auto-possessification means that some callouts are never
1232 taken. You can set this option if you want the matching functions to do
1233 a full unoptimized search and run all the callouts, but it is mainly
1234 provided for testing purposes.
1235
1236 PCRE2_NO_DOTSTAR_ANCHOR
1237
1238 If this option is set, it disables an optimization that is applied when
1239 .* is the first significant item in a top-level branch of a pattern,
1240 and all the other branches also start with .* or with \A or \G or ^.
1241 The optimization is automatically disabled for .* if it is inside an
1242 atomic group or a capturing group that is the subject of a back refer‐
1243 ence, or if the pattern contains (*PRUNE) or (*SKIP). When the opti‐
1244 mization is not disabled, such a pattern is automatically anchored if
1245 PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set
1246 for any ^ items. Otherwise, the fact that any match must start either
1247 at the start of the subject or following a newline is remembered. Like
1248 other optimizations, this can cause callouts to be skipped.
1249
1250 PCRE2_NO_START_OPTIMIZE
1251
1252 This is an option whose main effect is at matching time. It does not
1253 change what pcre2_compile() generates, but it does affect the output of
1254 the JIT compiler.
1255
1256 There are a number of optimizations that may occur at the start of a
1257 match, in order to speed up the process. For example, if it is known
1258 that an unanchored match must start with a specific character, the
1259 matching code searches the subject for that character, and fails imme‐
1260 diately if it cannot find it, without actually running the main match‐
1261 ing function. This means that a special item such as (*COMMIT) at the
1262 start of a pattern is not considered until after a suitable starting
1263 point for the match has been found. Also, when callouts or (*MARK)
1264 items are in use, these "start-up" optimizations can cause them to be
1265 skipped if the pattern is never actually used. The start-up optimiza‐
1266 tions are in effect a pre-scan of the subject that takes place before
1267 the pattern is run.
1268
1269 The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
1270 possibly causing performance to suffer, but ensuring that in cases
1271 where the result is "no match", the callouts do occur, and that items
1272 such as (*COMMIT) and (*MARK) are considered at every possible starting
1273 position in the subject string.
1274
1275 Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching
1276 operation. Consider the pattern
1277
1278 (*COMMIT)ABC
1279
1280 When this is compiled, PCRE2 records the fact that a match must start
1281 with the character "A". Suppose the subject string is "DEFABC". The
1282 start-up optimization scans along the subject, finds "A" and runs the
1283 first match attempt from there. The (*COMMIT) item means that the pat‐
1284 tern must match the current starting position, which in this case, it
1285 does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE
1286 set, the initial scan along the subject string does not happen. The
1287 first match attempt is run starting from "D" and when this fails,
1288 (*COMMIT) prevents any further matches being tried, so the overall
1289 result is "no match". There are also other start-up optimizations. For
1290 example, a minimum length for the subject may be recorded. Consider the
1291 pattern
1292
1293 (*MARK:A)(X|Y)
1294
1295 The minimum length for a match is one character. If the subject is
1296 "ABC", there will be attempts to match "ABC", "BC", and "C". An attempt
1297 to match an empty string at the end of the subject does not take place,
1298 because PCRE2 knows that the subject is now too short, and so the
1299 (*MARK) is never encountered. In this case, the optimization does not
1300 affect the overall match result, which is still "no match", but it does
1301 affect the auxiliary information that is returned.
1302
1303 PCRE2_NO_UTF_CHECK
1304
1305 When PCRE2_UTF is set, the validity of the pattern as a UTF string is
1306 automatically checked. There are discussions about the validity of
1307 UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode
1308 document. If an invalid UTF sequence is found, pcre2_compile() returns
1309 a negative error code.
1310
1311 If you know that your pattern is valid, and you want to skip this check
1312 for performance reasons, you can set the PCRE2_NO_UTF_CHECK option.
1313 When it is set, the effect of passing an invalid UTF string as a pat‐
1314 tern is undefined. It may cause your program to crash or loop. Note
1315 that this option can also be passed to pcre2_match() and
1316 pcre2_dfa_match(), to suppress validity checking of the subject string.
1317
1318 PCRE2_UCP
1319
1320 This option changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W,
1321 \w, and some of the POSIX character classes. By default, only ASCII
1322 characters are recognized, but if PCRE2_UCP is set, Unicode properties
1323 are used instead to classify characters. More details are given in the
1324 section on generic character types in the pcre2pattern page. If you set
1325 PCRE2_UCP, matching one of the items it affects takes much longer. The
1326 option is available only if PCRE2 has been compiled with Unicode sup‐
1327 port.
1328
1329 PCRE2_UNGREEDY
1330
1331 This option inverts the "greediness" of the quantifiers so that they
1332 are not greedy by default, but become greedy if followed by "?". It is
1333 not compatible with Perl. It can also be set by a (?U) option setting
1334 within the pattern.
1335
1336 PCRE2_USE_OFFSET_LIMIT
1337
1338 This option must be set for pcre2_compile() if pcre2_set_offset_limit()
1339 is going to be used to set a non-default offset limit in a match con‐
1340 text for matches that use this pattern. An error is generated if an
1341 offset limit is set without this option. For more details, see the
1342 description of pcre2_set_offset_limit() in the section that describes
1343 match contexts. See also the PCRE2_FIRSTLINE option above.
1344
1345 PCRE2_UTF
1346
1347 This option causes PCRE2 to regard both the pattern and the subject
1348 strings that are subsequently processed as strings of UTF characters
1349 instead of single-code-unit strings. It is available when PCRE2 is
1350 built to include Unicode support (which is the default). If Unicode
1351 support is not available, the use of this option provokes an error.
1352 Details of how this option changes the behaviour of PCRE2 are given in
1353 the pcre2unicode page.
1354
1356
1357 There are over 80 positive error codes that pcre2_compile() may return
1358 (via errorcode) if it finds an error in the pattern. There are also
1359 some negative error codes that are used for invalid UTF strings. These
1360 are the same as given by pcre2_match() and pcre2_dfa_match(), and are
1361 described in the pcre2unicode page. The pcre2_get_error_message() func‐
1362 tion (see "Obtaining a textual error message" below) can be called to
1363 obtain a textual error message from any error code.
1364
1366
1367 int pcre2_jit_compile(pcre2_code *code, uint32_t options);
1368
1369 int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
1370 PCRE2_SIZE length, PCRE2_SIZE startoffset,
1371 uint32_t options, pcre2_match_data *match_data,
1372 pcre2_match_context *mcontext);
1373
1374 void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
1375
1376 pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE startsize,
1377 PCRE2_SIZE maxsize, pcre2_general_context *gcontext);
1378
1379 void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
1380 pcre2_jit_callback callback_function, void *callback_data);
1381
1382 void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
1383
1384 These functions provide support for JIT compilation, which, if the
1385 just-in-time compiler is available, further processes a compiled pat‐
1386 tern into machine code that executes much faster than the pcre2_match()
1387 interpretive matching function. Full details are given in the pcre2jit
1388 documentation.
1389
1390 JIT compilation is a heavyweight optimization. It can take some time
1391 for patterns to be analyzed, and for one-off matches and simple pat‐
1392 terns the benefit of faster execution might be offset by a much slower
1393 compilation time. Most, but not all patterns can be optimized by the
1394 JIT compiler.
1395
1397
1398 PCRE2 handles caseless matching, and determines whether characters are
1399 letters, digits, or whatever, by reference to a set of tables, indexed
1400 by character code point. This applies only to characters whose code
1401 points are less than 256. By default, higher-valued code points never
1402 match escapes such as \w or \d. However, if PCRE2 is built with UTF
1403 support, all characters can be tested with \p and \P, or, alterna‐
1404 tively, the PCRE2_UCP option can be set when a pattern is compiled;
1405 this causes \w and friends to use Unicode property support instead of
1406 the built-in tables.
1407
1408 The use of locales with Unicode is discouraged. If you are handling
1409 characters with code points greater than 127, you should either use
1410 Unicode support, or use locales, but not try to mix the two.
1411
1412 PCRE2 contains an internal set of character tables that are used by
1413 default. These are sufficient for many applications. Normally, the
1414 internal tables recognize only ASCII characters. However, when PCRE2 is
1415 built, it is possible to cause the internal tables to be rebuilt in the
1416 default "C" locale of the local system, which may cause them to be dif‐
1417 ferent.
1418
1419 The internal tables can be overridden by tables supplied by the appli‐
1420 cation that calls PCRE2. These may be created in a different locale
1421 from the default. As more and more applications change to using Uni‐
1422 code, the need for this locale support is expected to die away.
1423
1424 External tables are built by calling the pcre2_maketables() function,
1425 in the relevant locale. The result can be passed to pcre2_compile() as
1426 often as necessary, by creating a compile context and calling
1427 pcre2_set_character_tables() to set the tables pointer therein. For
1428 example, to build and use tables that are appropriate for the French
1429 locale (where accented characters with values greater than 127 are
1430 treated as letters), the following code could be used:
1431
1432 setlocale(LC_CTYPE, "fr_FR");
1433 tables = pcre2_maketables(NULL);
1434 ccontext = pcre2_compile_context_create(NULL);
1435 pcre2_set_character_tables(ccontext, tables);
1436 re = pcre2_compile(..., ccontext);
1437
1438 The locale name "fr_FR" is used on Linux and other Unix-like systems;
1439 if you are using Windows, the name for the French locale is "french".
1440 It is the caller's responsibility to ensure that the memory containing
1441 the tables remains available for as long as it is needed.
1442
1443 The pointer that is passed (via the compile context) to pcre2_compile()
1444 is saved with the compiled pattern, and the same tables are used by
1445 pcre2_match() and pcre2_dfa_match(). Thus, for any single pattern, com‐
1446 pilation, and matching all happen in the same locale, but different
1447 patterns can be processed in different locales.
1448
1450
1451 int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);
1452
1453 The pcre2_pattern_info() function returns general information about a
1454 compiled pattern. For information about callouts, see the next section.
1455 The first argument for pcre2_pattern_info() is a pointer to the com‐
1456 piled pattern. The second argument specifies which piece of information
1457 is required, and the third argument is a pointer to a variable to
1458 receive the data. If the third argument is NULL, the first argument is
1459 ignored, and the function returns the size in bytes of the variable
1460 that is required for the information requested. Otherwise, the yield of
1461 the function is zero for success, or one of the following negative num‐
1462 bers:
1463
1464 PCRE2_ERROR_NULL the argument code was NULL
1465 PCRE2_ERROR_BADMAGIC the "magic number" was not found
1466 PCRE2_ERROR_BADOPTION the value of what was invalid
1467 PCRE2_ERROR_UNSET the requested field is not set
1468
1469 The "magic number" is placed at the start of each compiled pattern as
1470 a simple check against passing an arbitrary memory pointer. Here is a
1471 typical call of pcre2_pattern_info(), to obtain the length of the com‐
1472 piled pattern:
1473
1474 int rc;
1475 size_t length;
1476 rc = pcre2_pattern_info(
1477 re, /* result of pcre2_compile() */
1478 PCRE2_INFO_SIZE, /* what is required */
1479 &length); /* where to put the data */
1480
1481 The possible values for the second argument are defined in pcre2.h, and
1482 are as follows:
1483
1484 PCRE2_INFO_ALLOPTIONS
1485 PCRE2_INFO_ARGOPTIONS
1486
1487 Return a copy of the pattern's options. The third argument should point
1488 to a uint32_t variable. PCRE2_INFO_ARGOPTIONS returns exactly the
1489 options that were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOP‐
1490 TIONS returns the compile options as modified by any top-level (*XXX)
1491 option settings such as (*UTF) at the start of the pattern itself.
1492
1493 For example, if the pattern /(*UTF)abc/ is compiled with the
1494 PCRE2_EXTENDED option, the result for PCRE2_INFO_ALLOPTIONS is
1495 PCRE2_EXTENDED and PCRE2_UTF. Option settings such as (?i) that can
1496 change within a pattern do not affect the result of PCRE2_INFO_ALLOP‐
1497 TIONS, even if they appear right at the start of the pattern. (This was
1498 different in some earlier releases.)
1499
1500 A pattern compiled without PCRE2_ANCHORED is automatically anchored by
1501 PCRE2 if the first significant item in every top-level branch is one of
1502 the following:
1503
1504 ^ unless PCRE2_MULTILINE is set
1505 \A always
1506 \G always
1507 .* sometimes - see below
1508
1509 When .* is the first significant item, anchoring is possible only when
1510 all the following are true:
1511
1512 .* is not in an atomic group
1513 .* is not in a capturing group that is the subject
1514 of a back reference
1515 PCRE2_DOTALL is in force for .*
1516 Neither (*PRUNE) nor (*SKIP) appears in the pattern.
1517 PCRE2_NO_DOTSTAR_ANCHOR is not set.
1518
1519 For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in
1520 the options returned for PCRE2_INFO_ALLOPTIONS.
1521
1522 PCRE2_INFO_BACKREFMAX
1523
1524 Return the number of the highest back reference in the pattern. The
1525 third argument should point to a uint32_t variable. Named subpatterns
1526 acquire numbers as well as names, and these count towards the highest
1527 back reference. Back references such as \4 or \g{12} match the cap‐
1528 tured characters of the given group, but in addition, the check that a
1529 capturing group is set in a conditional subpattern such as (?(3)a|b) is
1530 also a back reference. Zero is returned if there are no back refer‐
1531 ences.
1532
1533 PCRE2_INFO_BSR
1534
1535 The output is a uint32_t whose value indicates what character sequences
1536 the \R escape sequence matches. A value of PCRE2_BSR_UNICODE means that
1537 \R matches any Unicode line ending sequence; a value of PCRE2_BSR_ANY‐
1538 CRLF means that \R matches only CR, LF, or CRLF.
1539
1540 PCRE2_INFO_CAPTURECOUNT
1541
1542 Return the highest capturing subpattern number in the pattern. In pat‐
1543 terns where (?| is not used, this is also the total number of capturing
1544 subpatterns. The third argument should point to a uint32_t variable.
1545
1546 PCRE2_INFO_FIRSTBITMAP
1547
1548 In the absence of a single first code unit for a non-anchored pattern,
1549 pcre2_compile() may construct a 256-bit table that defines a fixed set
1550 of values for the first code unit in any match. For example, a pattern
1551 that starts with [abc] results in a table with three bits set. When
1552 code unit values greater than 255 are supported, the flag bit for 255
1553 means "any code unit of value 255 or above". If such a table was con‐
1554 structed, a pointer to it is returned. Otherwise NULL is returned. The
1555 third argument should point to a const uint8_t * variable.
1556
1557 PCRE2_INFO_FIRSTCODETYPE
1558
1559 Return information about the first code unit of any matched string, for
1560 a non-anchored pattern. The third argument should point to a uint32_t
1561 variable. If there is a fixed first value, for example, the letter "c"
1562 from a pattern such as (cat|cow|coyote), 1 is returned, and the charac‐
1563 ter value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is
1564 no fixed first value, but it is known that a match can occur only at
1565 the start of the subject or following a newline in the subject, 2 is
1566 returned. Otherwise, and for anchored patterns, 0 is returned.
1567
1568 PCRE2_INFO_FIRSTCODEUNIT
1569
1570 Return the value of the first code unit of any matched string in the
1571 situation where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0.
1572 The third argument should point to a uint32_t variable. In the 8-bit
1573 library, the value is always less than 256. In the 16-bit library the
1574 value can be up to 0xffff. In the 32-bit library in UTF-32 mode the
1575 value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32
1576 mode.
1577
1578 PCRE2_INFO_HASBACKSLASHC
1579
1580 Return 1 if the pattern contains any instances of \C, otherwise 0. The
1581 third argument should point to a uint32_t variable.
1582
1583 PCRE2_INFO_HASCRORLF
1584
1585 Return 1 if the pattern contains any explicit matches for CR or LF
1586 characters, otherwise 0. The third argument should point to a uint32_t
1587 variable. An explicit match is either a literal CR or LF character, or
1588 \r or \n.
1589
1590 PCRE2_INFO_JCHANGED
1591
1592 Return 1 if the (?J) or (?-J) option setting is used in the pattern,
1593 otherwise 0. The third argument should point to a uint32_t variable.
1594 (?J) and (?-J) set and unset the local PCRE2_DUPNAMES option, respec‐
1595 tively.
1596
1597 PCRE2_INFO_JITSIZE
1598
1599 If the compiled pattern was successfully processed by pcre2_jit_com‐
1600 pile(), return the size of the JIT compiled code, otherwise return
1601 zero. The third argument should point to a size_t variable.
1602
1603 PCRE2_INFO_LASTCODETYPE
1604
1605 Returns 1 if there is a rightmost literal code unit that must exist in
1606 any matched string, other than at its start. The third argument should
1607 point to a uint32_t variable. If there is no such value, 0 is
1608 returned. When 1 is returned, the code unit value itself can be
1609 retrieved using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last
1610 literal value is recorded only if it follows something of variable
1611 length. For example, for the pattern /^a\d+z\d+/ the returned value is
1612 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/
1613 the returned value is 0.
1614
1615 PCRE2_INFO_LASTCODEUNIT
1616
1617 Return the value of the rightmost literal code unit that must exist in
1618 any matched string, other than at its start, if such a value has been
1619 recorded. The third argument should point to a uint32_t variable. If
1620 there is no such value, 0 is returned.
1621
1622 PCRE2_INFO_MATCHEMPTY
1623
1624 Return 1 if the pattern might match an empty string, otherwise 0. The
1625 third argument should point to a uint32_t variable. When a pattern
1626 contains recursive subroutine calls it is not always possible to deter‐
1627 mine whether or not it can match an empty string. PCRE2 takes a cau‐
1628 tious approach and returns 1 in such cases.
1629
1630 PCRE2_INFO_MATCHLIMIT
1631
1632 If the pattern set a match limit by including an item of the form
1633 (*LIMIT_MATCH=nnnn) at the start, the value is returned. The third
1634 argument should point to an unsigned 32-bit integer. If no such value
1635 has been set, the call to pcre2_pattern_info() returns the error
1636 PCRE2_ERROR_UNSET.
1637
1638 PCRE2_INFO_MAXLOOKBEHIND
1639
1640 Return the number of characters (not code units) in the longest lookbe‐
1641 hind assertion in the pattern. The third argument should point to an
1642 unsigned 32-bit integer. This information is useful when doing multi-
1643 segment matching using the partial matching facilities. Note that the
1644 simple assertions \b and \B require a one-character lookbehind. \A also
1645 registers a one-character lookbehind, though it does not actually
1646 inspect the previous character. This is to ensure that at least one
1647 character from the old segment is retained when a new segment is pro‐
1648 cessed. Otherwise, if there are no lookbehinds in the pattern, \A might
1649 match incorrectly at the start of a new segment.
1650
1651 PCRE2_INFO_MINLENGTH
1652
1653 If a minimum length for matching subject strings was computed, its
1654 value is returned. Otherwise the returned value is 0. The value is a
1655 number of characters, which in UTF mode may be different from the num‐
1656 ber of code units. The third argument should point to a uint32_t
1657 variable. The value is a lower bound to the length of any matching
1658 string. There may not be any strings of that length that do actually
1659 match, but every string that does match is at least that long.
1660
1661 PCRE2_INFO_NAMECOUNT
1662 PCRE2_INFO_NAMEENTRYSIZE
1663 PCRE2_INFO_NAMETABLE
1664
1665 PCRE2 supports the use of named as well as numbered capturing parenthe‐
1666 ses. The names are just an additional way of identifying the parenthe‐
1667 ses, which still acquire numbers. Several convenience functions such as
1668 pcre2_substring_get_byname() are provided for extracting captured sub‐
1669 strings by name. It is also possible to extract the data directly, by
1670 first converting the name to a number in order to access the correct
1671 pointers in the output vector (described with pcre2_match() below). To
1672 do the conversion, you need to use the name-to-number map, which is
1673 described by these three values.
1674
1675 The map consists of a number of fixed-size entries. PCRE2_INFO_NAME‐
1676 COUNT gives the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives
1677 the size of each entry in code units; both of these return a uint32_t
1678 value. The entry size depends on the length of the longest name.
1679
1680 PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table.
1681 This is a PCRE2_SPTR pointer to a block of code units. In the 8-bit
1682 library, the first two bytes of each entry are the number of the cap‐
1683 turing parenthesis, most significant byte first. In the 16-bit library,
1684 the pointer points to 16-bit code units, the first of which contains
1685 the parenthesis number. In the 32-bit library, the pointer points to
1686 32-bit code units, the first of which contains the parenthesis number.
1687 The rest of the entry is the corresponding name, zero terminated.
1688
1689 The names are in alphabetical order. If (?| is used to create multiple
1690 groups with the same number, as described in the section on duplicate
1691 subpattern numbers in the pcre2pattern page, the groups may be given
1692 the same name, but there is only one entry in the table. Different
1693 names for groups of the same number are not permitted.
1694
1695 Duplicate names for subpatterns with different numbers are permitted,
1696 but only if PCRE2_DUPNAMES is set. They appear in the table in the
1697 order in which they were found in the pattern. In the absence of (?|
1698 this is the order of increasing number; when (?| is used this is not
1699 necessarily the case because later subpatterns may have lower numbers.
1700
1701 As a simple example of the name/number table, consider the following
1702 pattern after compilation by the 8-bit library (assume PCRE2_EXTENDED
1703 is set, so white space - including newlines - is ignored):
1704
1705 (?<date> (?<year>(\d\d)?\d\d) -
1706 (?<month>\d\d) - (?<day>\d\d) )
1707
1708 There are four named subpatterns, so the table has four entries, and
1709 each entry in the table is eight bytes long. The table is as follows,
1710 with non-printing bytes shown in hexadecimal, and undefined bytes shown
1711 as ??:
1712
1713 00 01 d a t e 00 ??
1714 00 05 d a y 00 ?? ??
1715 00 04 m o n t h 00
1716 00 02 y e a r 00 ??
1717
1718 When writing code to extract data from named subpatterns using the
1719 name-to-number map, remember that the length of the entries is likely
1720 to be different for each compiled pattern.
1721
1722 PCRE2_INFO_NEWLINE
1723
1724 The output is a uint32_t with one of the following values:
1725
1726 PCRE2_NEWLINE_CR Carriage return (CR)
1727 PCRE2_NEWLINE_LF Linefeed (LF)
1728 PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF)
1729 PCRE2_NEWLINE_ANY Any Unicode line ending
1730 PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF
1731
1732 This specifies the default character sequence that will be recognized
1733 as meaning "newline" while matching.
1734
1735 PCRE2_INFO_RECURSIONLIMIT
1736
1737 If the pattern set a recursion limit by including an item of the form
1738 (*LIMIT_RECURSION=nnnn) at the start, the value is returned. The third
1739 argument should point to an unsigned 32-bit integer. If no such value
1740 has been set, the call to pcre2_pattern_info() returns the error
1741 PCRE2_ERROR_UNSET.
1742
1743 PCRE2_INFO_SIZE
1744
1745 Return the size of the compiled pattern in bytes (for all three
1746 libraries). The third argument should point to a size_t variable. This
1747 value includes the size of the general data block that precedes the
1748 code units of the compiled pattern itself. The value that is used when
1749 pcre2_compile() is getting memory in which to place the compiled pat‐
1750 tern may be slightly larger than the value returned by this option,
1751 because there are cases where the code that calculates the size has to
1752 over-estimate. Processing a pattern with the JIT compiler does not
1753 alter the value returned by this option.
1754
1756
1757 int pcre2_callout_enumerate(const pcre2_code *code,
1758 int (*callback)(pcre2_callout_enumerate_block *, void *),
1759 void *user_data);
1760
1761 A script language that supports the use of string arguments in callouts
1762 might like to scan all the callouts in a pattern before running the
1763 match. This can be done by calling pcre2_callout_enumerate(). The first
1764 argument is a pointer to a compiled pattern, the second points to a
1765 callback function, and the third is arbitrary user data. The callback
1766 function is called for every callout in the pattern in the order in
1767 which they appear. Its first argument is a pointer to a callout enumer‐
1768 ation block, and its second argument is the user_data value that was
1769 passed to pcre2_callout_enumerate(). The contents of the callout enu‐
1770 meration block are described in the pcre2callout documentation, which
1771 also gives further details about callouts.
1772
1774
1775 It is possible to save compiled patterns on disc or elsewhere, and
1776 reload them later, subject to a number of restrictions. The functions
1777 whose names begin with pcre2_serialize_ are used for this purpose. They
1778 are described in the pcre2serialize documentation.
1779
1781
1782 pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
1783 pcre2_general_context *gcontext);
1784
1785 pcre2_match_data *pcre2_match_data_create_from_pattern(
1786 const pcre2_code *code, pcre2_general_context *gcontext);
1787
1788 void pcre2_match_data_free(pcre2_match_data *match_data);
1789
1790 Information about a successful or unsuccessful match is placed in a
1791 match data block, which is an opaque structure that is accessed by
1792 function calls. In particular, the match data block contains a vector
1793 of offsets into the subject string that define the matched part of the
1794 subject and any substrings that were captured. This is known as the
1795 ovector.
1796
1797 Before calling pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match()
1798 you must create a match data block by calling one of the creation func‐
1799 tions above. For pcre2_match_data_create(), the first argument is the
1800 number of pairs of offsets in the ovector. One pair of offsets is
1801 required to identify the string that matched the whole pattern, with
1802 another pair for each captured substring. For example, a value of 4
1803 creates enough space to record the matched portion of the subject plus
1804 three captured substrings. A minimum of 1 pair is imposed by
1805 pcre2_match_data_create(), so it is always possible to return the over‐
1806 all matched string.
1807
1808 The second argument of pcre2_match_data_create() is a pointer to a gen‐
1809 eral context, which can specify custom memory management for obtaining
1810 the memory for the match data block. If you are not using custom memory
1811 management, pass NULL, which causes malloc() to be used.
1812
1813 For pcre2_match_data_create_from_pattern(), the first argument is a
1814 pointer to a compiled pattern. The ovector is created to be exactly the
1815 right size to hold all the substrings a pattern might capture. The sec‐
1816 ond argument is again a pointer to a general context, but in this case
1817 if NULL is passed, the memory is obtained using the same allocator that
1818 was used for the compiled pattern (custom or default).
1819
1820 A match data block can be used many times, with the same or different
1821 compiled patterns. You can extract information from a match data block
1822 after a match operation has finished, using functions that are
1823 described in the sections on matched strings and other match data
1824 below.
1825
1826 When a call of pcre2_match() fails, valid data is available in the
1827 match block only when the error is PCRE2_ERROR_NOMATCH,
1828 PCRE2_ERROR_PARTIAL, or one of the error codes for an invalid UTF
1829 string. Exactly what is available depends on the error, and is detailed
1830 below.
1831
1832 When one of the matching functions is called, pointers to the compiled
1833 pattern and the subject string are set in the match data block so that
1834 they can be referenced by the extraction functions. After running a
1835 match, you must not free a compiled pattern or a subject string until
1836 after all operations on the match data block (for that match) have
1837 taken place.
1838
1839 When a match data block itself is no longer needed, it should be freed
1840 by calling pcre2_match_data_free().
1841
1843
1844 int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
1845 PCRE2_SIZE length, PCRE2_SIZE startoffset,
1846 uint32_t options, pcre2_match_data *match_data,
1847 pcre2_match_context *mcontext);
1848
1849 The function pcre2_match() is called to match a subject string against
1850 a compiled pattern, which is passed in the code argument. You can call
1851 pcre2_match() with the same code argument as many times as you like, in
1852 order to find multiple matches in the subject string or to match dif‐
1853 ferent subject strings with the same pattern.
1854
1855 This function is the main matching facility of the library, and it
1856 operates in a Perl-like manner. For specialist use there is also an
1857 alternative matching function, which is described below in the section
1858 about the pcre2_dfa_match() function.
1859
1860 Here is an example of a simple call to pcre2_match():
1861
1862 pcre2_match_data *md = pcre2_match_data_create(4, NULL);
1863 int rc = pcre2_match(
1864 re, /* result of pcre2_compile() */
1865 "some string", /* the subject string */
1866 11, /* the length of the subject string */
1867 0, /* start at offset 0 in the subject */
1868 0, /* default options */
1869 md, /* the match data block */
1870 NULL); /* a match context; NULL means use defaults */
1871
1872 If the subject string is zero-terminated, the length can be given as
1873 PCRE2_ZERO_TERMINATED. A match context must be provided if certain less
1874 common matching parameters are to be changed. For details, see the sec‐
1875 tion on the match context above.
1876
1877 The string to be matched by pcre2_match()
1878
1879 The subject string is passed to pcre2_match() as a pointer in subject,
1880 a length in length, and a starting offset in startoffset. The length
1881 and offset are in code units, not characters. That is, they are in
1882 bytes for the 8-bit library, 16-bit code units for the 16-bit library,
1883 and 32-bit code units for the 32-bit library, whether or not UTF pro‐
1884 cessing is enabled.
1885
1886 If startoffset is greater than the length of the subject, pcre2_match()
1887 returns PCRE2_ERROR_BADOFFSET. When the starting offset is zero, the
1888 search for a match starts at the beginning of the subject, and this is
1889 by far the most common case. In UTF-8 or UTF-16 mode, the starting off‐
1890 set must point to the start of a character, or to the end of the sub‐
1891 ject (in UTF-32 mode, one code unit equals one character, so all off‐
1892 sets are valid). Like the pattern string, the subject may contain
1893 binary zeroes.
1894
1895 A non-zero starting offset is useful when searching for another match
1896 in the same subject by calling pcre2_match() again after a previous
1897 success. Setting startoffset differs from passing over a shortened
1898 string and setting PCRE2_NOTBOL in the case of a pattern that begins
1899 with any kind of lookbehind. For example, consider the pattern
1900
1901 \Biss\B
1902
1903 which finds occurrences of "iss" in the middle of words. (\B matches
1904 only if the current position in the subject is not a word boundary.)
1905 When applied to the string "Mississippi" the first call to pcre2_match()
1906 finds the first occurrence. If pcre2_match() is called again with just
1907 the remainder of the subject, namely "issippi", it does not match,
1908 because \B is always false at the start of the subject, which is deemed
1909 to be a word boundary. However, if pcre2_match() is passed the entire
1910 string again, but with startoffset set to 4, it finds the second occur‐
1911 rence of "iss" because it is able to look behind the starting point to
1912 discover that it is preceded by a letter.
1913
1914 Finding all the matches in a subject is tricky when the pattern can
1915 match an empty string. It is possible to emulate Perl's /g behaviour by
1916 first trying the match again at the same offset, with the
1917 PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED options, and then if that
1918 fails, advancing the starting offset and trying an ordinary match
1919 again. There is some code that demonstrates how to do this in the
1920 pcre2demo sample program. In the most general case, you have to check
1921 to see if the newline convention recognizes CRLF as a newline, and if
1922 so, and the current character is CR followed by LF, advance the start‐
1923 ing offset by two characters instead of one.
1924
1925 If a non-zero starting offset is passed when the pattern is anchored,
1926 one attempt to match at the given offset is made. This can only succeed
1927 if the pattern does not require the match to be at the start of the
1928 subject.
1929
1930 Option bits for pcre2_match()
1931
1932 The unused bits of the options argument for pcre2_match() must be zero.
1933 The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
1934 PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT,
1935 PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their
1936 action is described below.
1937
1938 Setting PCRE2_ANCHORED at match time is not supported by the just-in-
1939 time (JIT) compiler. If it is set, JIT matching is disabled and the
1940 normal interpretive code in pcre2_match() is run. Apart from
1941 PCRE2_NO_JIT (obviously), the remaining options are supported for JIT
1942 matching.
1943
1944 PCRE2_ANCHORED
1945
1946 The PCRE2_ANCHORED option limits pcre2_match() to matching at the first
1947 matching position. If a pattern was compiled with PCRE2_ANCHORED, or
1948 turned out to be anchored by virtue of its contents, it cannot be made
1949 unanchored at matching time. Note that setting the option at match time
1950 disables JIT matching.
1951
1952 PCRE2_NOTBOL
1953
1954 This option specifies that the first character of the subject string is not
1955 the beginning of a line, so the circumflex metacharacter should not
1956 match before it. Setting this without having set PCRE2_MULTILINE at
1957 compile time causes circumflex never to match. This option affects only
1958 the behaviour of the circumflex metacharacter. It does not affect \A.
1959
1960 PCRE2_NOTEOL
1961
1962 This option specifies that the end of the subject string is not the end
1963 of a line, so the dollar metacharacter should not match it nor (except
1964 in multiline mode) a newline immediately before it. Setting this with‐
1965 out having set PCRE2_MULTILINE at compile time causes dollar never to
1966 match. This option affects only the behaviour of the dollar metacharac‐
1967 ter. It does not affect \Z or \z.
1968
1969 PCRE2_NOTEMPTY
1970
1971 An empty string is not considered to be a valid match if this option is
1972 set. If there are alternatives in the pattern, they are tried. If all
1973 the alternatives match the empty string, the entire match fails. For
1974 example, if the pattern
1975
1976 a?b?
1977
1978 is applied to a string not beginning with "a" or "b", it matches an
1979 empty string at the start of the subject. With PCRE2_NOTEMPTY set, this
1980 match is not valid, so pcre2_match() searches further into the string
1981 for occurrences of "a" or "b".
1982
1983 PCRE2_NOTEMPTY_ATSTART
1984
1985 This is like PCRE2_NOTEMPTY, except that it locks out an empty string
1986 match only at the first matching position, that is, at the start of the
1987 subject plus the starting offset. An empty string match later in the
1988 subject is permitted. If the pattern is anchored, such a match can
1989 occur only if the pattern contains \K.
1990
1991 PCRE2_NO_JIT
1992
1993 By default, if a pattern has been successfully processed by
1994 pcre2_jit_compile(), JIT is automatically used when pcre2_match() is
1995 called with options that JIT supports. Setting PCRE2_NO_JIT disables
1996 the use of JIT; it forces matching to be done by the interpreter.
1997
1998 PCRE2_NO_UTF_CHECK
1999
2000 When PCRE2_UTF is set at compile time, the validity of the subject as a
2001 UTF string is checked by default when pcre2_match() is subsequently
2002 called. If a non-zero starting offset is given, the check is applied
2003 only to that part of the subject that could be inspected during match‐
2004 ing, and there is a check that the starting offset points to the first
2005 code unit of a character or to the end of the subject. If there are no
2006 lookbehind assertions in the pattern, the check starts at the starting
2007 offset. Otherwise, it starts at the length of the longest lookbehind
2008 before the starting offset, or at the start of the subject if there are
2009 not that many characters before the starting offset. Note that the
2010 sequences \b and \B are one-character lookbehinds.
2011
2012 The check is carried out before any other processing takes place, and a
2013 negative error code is returned if the check fails. There are several
2014 UTF error codes for each code unit width, corresponding to different
2015 problems with the code unit sequence. There are discussions about the
2016 validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the
2017 pcre2unicode page.
2018
2019 If you know that your subject is valid, and you want to skip these
2020 checks for performance reasons, you can set the PCRE2_NO_UTF_CHECK
2021 option when calling pcre2_match(). You might want to do this for the
2022 second and subsequent calls to pcre2_match() if you are making repeated
2023 calls to find all the matches in a single subject string.
2024
2025 NOTE: When PCRE2_NO_UTF_CHECK is set, the effect of passing an invalid
2026 string as a subject, or an invalid value of startoffset, is undefined.
2027 Your program may crash or loop indefinitely.
2028
2029 PCRE2_PARTIAL_HARD
2030 PCRE2_PARTIAL_SOFT
2031
2032 These options turn on the partial matching feature. A partial match
2033 occurs if the end of the subject string is reached successfully, but
2034 there are not enough subject characters to complete the match. If this
2035 happens when PCRE2_PARTIAL_SOFT (but not PCRE2_PARTIAL_HARD) is set,
2036 matching continues by testing any remaining alternatives. Only if no
2037 complete match can be found is PCRE2_ERROR_PARTIAL returned instead of
2038 PCRE2_ERROR_NOMATCH. In other words, PCRE2_PARTIAL_SOFT specifies that
2039 the caller is prepared to handle a partial match, but only if no com‐
2040 plete match can be found.
2041
2042 If PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this
2043 case, if a partial match is found, pcre2_match() immediately returns
2044 PCRE2_ERROR_PARTIAL, without considering any other alternatives. In
2045 other words, when PCRE2_PARTIAL_HARD is set, a partial match is consid‐
2046 ered to be more important than an alternative complete match.
2047
2048 There is a more detailed discussion of partial and multi-segment match‐
2049 ing, with examples, in the pcre2partial documentation.
2050
2052
2053 When PCRE2 is built, a default newline convention is set; this is usu‐
2054 ally the standard convention for the operating system. The default can
2055 be overridden in a compile context by calling pcre2_set_newline(). It
2056 can also be overridden by starting a pattern string with, for example,
2057 (*CRLF), as described in the section on newline conventions in the
2058 pcre2pattern page. During matching, the newline choice affects the be‐
2059 haviour of the dot, circumflex, and dollar metacharacters. It may also
2060 alter the way the match starting position is advanced after a match
2061 failure for an unanchored pattern.
2062
2063 When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is
2064 set as the newline convention, and a match attempt for an unanchored
2065 pattern fails when the current starting position is at a CRLF sequence,
2066 and the pattern contains no explicit matches for CR or LF characters,
2067 the match position is advanced by two characters instead of one, in
2068 other words, to after the CRLF.
2069
2070 The above rule is a compromise that makes the most common cases work as
2071 expected. For example, if the pattern is .+A (and the PCRE2_DOTALL
2072 option is not set), it does not match the string "\r\nA" because, after
2073 failing at the start, it skips both the CR and the LF before retrying.
2074 However, the pattern [\r\n]A does match that string, because it con‐
2075 tains an explicit CR or LF reference, and so advances only by one char‐
2076 acter after the first failure.
2077
2078 An explicit match for CR or LF is either a literal appearance of one of
2079 those characters in the pattern, or one of the \r or \n escape
2080 sequences. Implicit matches such as [^X] do not count, nor does \s,
2081 even though it includes CR and LF in the characters that it matches.
2082
2083 Notwithstanding the above, anomalous effects may still occur when CRLF
2084 is a valid newline sequence and explicit \r or \n escapes appear in the
2085 pattern.
2086
2088
2089 uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
2090
2091 PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
2092
2093 In general, a pattern matches a certain portion of the subject, and in
2094 addition, further substrings from the subject may be picked out by
2095 parenthesized parts of the pattern. Following the usage in Jeffrey
2096 Friedl's book, this is called "capturing" in what follows, and the
2097 phrase "capturing subpattern" or "capturing group" is used for a frag‐
2098 ment of a pattern that picks out a substring. PCRE2 supports several
2099 other kinds of parenthesized subpattern that do not cause substrings to
2100 be captured. The pcre2_pattern_info() function can be used to find out
2101 how many capturing subpatterns there are in a compiled pattern.
2102
2103 You can use auxiliary functions for accessing captured substrings by
2104 number or by name, as described in sections below.
2105
2106 Alternatively, you can make direct use of the vector of PCRE2_SIZE val‐
2107 ues, called the ovector, which contains the offsets of captured
2108 strings. It is part of the match data block. The function
2109 pcre2_get_ovector_pointer() returns the address of the ovector, and
2110 pcre2_get_ovector_count() returns the number of pairs of values it con‐
2111 tains.
2112
2113 Within the ovector, the first in each pair of values is set to the off‐
2114 set of the first code unit of a substring, and the second is set to the
2115 offset of the first code unit after the end of a substring. These val‐
2116 ues are always code unit offsets, not character offsets. That is, they
2117 are byte offsets in the 8-bit library, 16-bit offsets in the 16-bit
2118 library, and 32-bit offsets in the 32-bit library.
2119
2120 After a partial match (error return PCRE2_ERROR_PARTIAL), only the
2121 first pair of offsets (that is, ovector[0] and ovector[1]) are set.
2122 They identify the part of the subject that was partially matched. See
2123 the pcre2partial documentation for details of partial matching.
2124
2125 After a successful match, the first pair of offsets identifies the por‐
2126 tion of the subject string that was matched by the entire pattern. The
2127 next pair is used for the first capturing subpattern, and so on. The
2128 value returned by pcre2_match() is one more than the highest numbered
2129 pair that has been set. For example, if two substrings have been cap‐
2130 tured, the returned value is 3. If there are no capturing subpatterns,
2131 the return value from a successful match is 1, indicating that just the
2132 first pair of offsets has been set.
2133
2134 If a pattern uses the \K escape sequence within a positive assertion,
2135 the reported start of a successful match can be greater than the end of
2136 the match. For example, if the pattern (?=ab\K) is matched against
2137 "ab", the start and end offset values for the match are 2 and 0.
2138
2139 If a capturing subpattern group is matched repeatedly within a single
2140 match operation, it is the last portion of the subject that it matched
2141 that is returned.
2142
2143 If the ovector is too small to hold all the captured substring offsets,
2144 as much as possible is filled in, and the function returns a value of
2145 zero. If captured substrings are not of interest, pcre2_match() may be
2146 called with a match data block whose ovector is of minimum length (that
2147 is, one pair). However, if the pattern contains back references and the
2148 ovector is not big enough to remember the related substrings, PCRE2 has
2149 to get additional memory for use during matching. Thus it is usually
2150 advisable to set up a match data block containing an ovector of reason‐
2151 able size.
2152
2153 It is possible for capturing subpattern number n+1 to match some part
2154 of the subject when subpattern n has not been used at all. For example,
2155 if the string "abc" is matched against the pattern (a|(z))(bc) the
2156 return from the function is 4, and subpatterns 1 and 3 are matched, but
2157 2 is not. When this happens, both values in the offset pairs corre‐
2158 sponding to unused subpatterns are set to PCRE2_UNSET.
2159
2160 Offset values that correspond to unused subpatterns at the end of the
2161 expression are also set to PCRE2_UNSET. For example, if the string
2162 "abc" is matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3
2163 are not matched. The return from the function is 2, because the high‐
2164 est used capturing subpattern number is 1. The offsets for the sec‐
2165 ond and third capturing subpatterns (assuming the vector is large
2166 enough, of course) are set to PCRE2_UNSET.
2167
2168 Elements in the ovector that do not correspond to capturing parentheses
2169 in the pattern are never changed. That is, if a pattern contains n cap‐
2170 turing parentheses, no more than ovector[0] to ovector[2n+1] are set by
2171 pcre2_match(). The other elements retain whatever values they previ‐
2172 ously had.
2173
2175
2176 PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
2177
2178 PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
2179
2180 As well as the offsets in the ovector, other information about a match
2181 is retained in the match data block and can be retrieved by the above
2182 functions in appropriate circumstances. If they are called at other
2183 times, the result is undefined.
2184
2185 After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a
2186 failure to match (PCRE2_ERROR_NOMATCH), a (*MARK) name may be avail‐
2187 able, and pcre2_get_mark() can be called. It returns a pointer to the
2188 zero-terminated name, which is within the compiled pattern. Otherwise
2189 NULL is returned. The length of the (*MARK) name (excluding the termi‐
2190 nating zero) is stored in the code unit that precedes the name. You
2191 should use this instead of relying on the terminating zero if the
2192 (*MARK) name might contain a binary zero.
2193
2194 After a successful match, the (*MARK) name that is returned is the last
2195 one encountered on the matching path through the pattern. After a "no
2196 match" or a partial match, the last encountered (*MARK) name is
2197 returned. For example, consider this pattern:
2198
2199 ^(*MARK:A)((*MARK:B)a|b)c
2200
2201 When it matches "bc", the returned mark is A. The B mark is "seen" in
2202 the first branch of the group, but it is not on the matching path. On
2203 the other hand, when this pattern fails to match "bx", the returned
2204 mark is B.
2205
2206 After a successful match, a partial match, or one of the invalid UTF
2207 errors (for example, PCRE2_ERROR_UTF8_ERR5), pcre2_get_startchar() can
2208 be called. After a successful or partial match it returns the code unit
2209 offset of the character at which the match started. For a non-partial
2210 match, this can be different to the value of ovector[0] if the pattern
2211 contains the \K escape sequence. After a partial match, however, this
2212 value is always the same as ovector[0] because \K does not affect the
2213 result of a partial match.
2214
2215 After a UTF check failure, pcre2_get_startchar() can be used to obtain
2216 the code unit offset of the invalid UTF character. Details are given in
2217 the pcre2unicode page.
2218
2220
2221 If pcre2_match() fails, it returns a negative number. This can be con‐
2222 verted to a text string by calling the pcre2_get_error_message() func‐
2223 tion (see "Obtaining a textual error message" below). Negative error
2224 codes are also returned by other functions, and are documented with
2225 them. The codes are given names in the header file. If UTF checking is
2226 in force and an invalid UTF subject string is detected, one of a number
2227 of UTF-specific negative error codes is returned. Details are given in
2228 the pcre2unicode page. The following are the other errors that may be
2229 returned by pcre2_match():
2230
2231 PCRE2_ERROR_NOMATCH
2232
2233 The subject string did not match the pattern.
2234
2235 PCRE2_ERROR_PARTIAL
2236
2237 The subject string did not match, but it did match partially. See the
2238 pcre2partial documentation for details of partial matching.
2239
2240 PCRE2_ERROR_BADMAGIC
2241
2242 PCRE2 stores a 4-byte "magic number" at the start of the compiled code,
2243 to catch the case when it is passed a junk pointer. This is the error
2244 that is returned when the magic number is not present.
2245
2246 PCRE2_ERROR_BADMODE
2247
2248 This error is given when a pattern that was compiled by the 8-bit
2249 library is passed to a 16-bit or 32-bit library function, or vice
2250 versa.
2251
2252 PCRE2_ERROR_BADOFFSET
2253
2254 The value of startoffset was greater than the length of the subject.
2255
2256 PCRE2_ERROR_BADOPTION
2257
2258 An unrecognized bit was set in the options argument.
2259
2260 PCRE2_ERROR_BADUTFOFFSET
2261
2262 The UTF code unit sequence that was passed as a subject was checked and
2263 found to be valid (the PCRE2_NO_UTF_CHECK option was not set), but the
2264 value of startoffset did not point to the beginning of a UTF character
2265 or the end of the subject.
2266
2267 PCRE2_ERROR_CALLOUT
2268
2269 This error is never generated by pcre2_match() itself. It is provided
2270 for use by callout functions that want to cause pcre2_match() or
2271 pcre2_callout_enumerate() to return a distinctive error code. See the
2272 pcre2callout documentation for details.
2273
2274 PCRE2_ERROR_INTERNAL
2275
2276 An unexpected internal error has occurred. This error could be caused
2277 by a bug in PCRE2 or by overwriting of the compiled pattern.
2278
2279 PCRE2_ERROR_JIT_BADOPTION
2280
2281 This error is returned when a pattern that was successfully studied
2282 using JIT is being matched, but the matching mode (partial or complete
2283 match) does not correspond to any JIT compilation mode. When the JIT
2284 fast path function is used, this error may also be given for invalid
2285 options. See the pcre2jit documentation for more details.
2286
2287 PCRE2_ERROR_JIT_STACKLIMIT
2288
2289 This error is returned when a pattern that was successfully studied
2290 using JIT is being matched, but the memory available for the just-in-
2291 time processing stack is not large enough. See the pcre2jit documenta‐
2292 tion for more details.
2293
2294 PCRE2_ERROR_MATCHLIMIT
2295
2296 The backtracking limit was reached.
2297
2298 PCRE2_ERROR_NOMEMORY
2299
2300 If a pattern contains back references, but the ovector is not big
2301 enough to remember the referenced substrings, PCRE2 gets a block of
2302 memory at the start of matching to use for this purpose. There are some
2303 other special cases where extra memory is needed during matching. This
2304 error is given when memory cannot be obtained.
2305
2306 PCRE2_ERROR_NULL
2307
2308 Either the code, subject, or match_data argument was passed as NULL.
2309
2310 PCRE2_ERROR_RECURSELOOP
2311
2312 This error is returned when pcre2_match() detects a recursion loop
2313 within the pattern. Specifically, it means that either the whole pat‐
2314 tern or a subpattern has been called recursively for the second time at
2315 the same position in the subject string. Some simple patterns that
2316 might do this are detected and faulted at compile time, but more com‐
2317 plicated cases, in particular mutual recursions between two different
2318 subpatterns, cannot be detected until matching is attempted.
2319
2320 PCRE2_ERROR_RECURSIONLIMIT
2321
2322 The internal recursion limit was reached.
2323
2325
2326 int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
2327 PCRE2_SIZE bufflen);
2328
2329 A text message for an error code from any PCRE2 function (compile,
2330 match, or auxiliary) can be obtained by calling pcre2_get_error_mes‐
2331 sage(). The code is passed as the first argument, with the remaining
2332 two arguments specifying a code unit buffer and its length, into which
2333 the text message is placed. Note that the message is returned in code
2334 units of the appropriate width for the library that is being used.
2335
2336 The returned message is terminated with a trailing zero, and the func‐
2337 tion returns the number of code units used, excluding the trailing
2338 zero. If the error number is unknown, the negative error code
2339 PCRE2_ERROR_BADDATA is returned. If the buffer is too small, the mes‐
2340 sage is truncated (but still with a trailing zero), and the negative
2341 error code PCRE2_ERROR_NOMEMORY is returned. None of the messages are
2342 very long; a buffer size of 120 code units is ample.
2343
2345
2346 int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
2347 uint32_t number, PCRE2_SIZE *length);
2348
2349 int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
2350 uint32_t number, PCRE2_UCHAR *buffer,
2351 PCRE2_SIZE *bufflen);
2352
2353 int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
2354 uint32_t number, PCRE2_UCHAR **bufferptr,
2355 PCRE2_SIZE *bufflen);
2356
2357 void pcre2_substring_free(PCRE2_UCHAR *buffer);
2358
2359 Captured substrings can be accessed directly by using the ovector as
2360 described above. For convenience, auxiliary functions are provided for
2361 extracting captured substrings as new, separate, zero-terminated
2362 strings. A substring that contains a binary zero is correctly extracted
2363 and has a further zero added on the end, but the result is not, of
2364 course, a C string.
2365
2366 The functions in this section identify substrings by number. The number
2367 zero refers to the entire matched substring, with higher numbers refer‐
2368 ring to substrings captured by parenthesized groups. After a partial
2369 match, only substring zero is available. An attempt to extract any
2370 other substring gives the error PCRE2_ERROR_PARTIAL. The next section
2371 describes similar functions for extracting captured substrings by name.
2372
2373 If a pattern uses the \K escape sequence within a positive assertion,
2374 the reported start of a successful match can be greater than the end of
2375 the match. For example, if the pattern (?=ab\K) is matched against
2376 "ab", the start and end offset values for the match are 2 and 0. In
2377 this situation, calling these functions with a zero substring number
2378 extracts a zero-length empty string.
2379
2380 You can find the length in code units of a captured substring without
2381 extracting it by calling pcre2_substring_length_bynumber(). The first
2382 argument is a pointer to the match data block, the second is the group
2383 number, and the third is a pointer to a variable into which the length
2384 is placed. If you just want to know whether or not the substring has
2385 been captured, you can pass the third argument as NULL.
2386
2387 The pcre2_substring_copy_bynumber() function copies a captured sub‐
2388 string into a supplied buffer, whereas pcre2_substring_get_bynumber()
2389 copies it into new memory, obtained using the same memory allocation
2390 function that was used for the match data block. The first two argu‐
2391 ments of these functions are a pointer to the match data block and a
2392 capturing group number.
2393
2394 The final arguments of pcre2_substring_copy_bynumber() are a pointer to
2395 the buffer and a pointer to a variable that contains its length in code
2396 units. This is updated to contain the actual number of code units used
2397 for the extracted substring, excluding the terminating zero.
2398
2399 For pcre2_substring_get_bynumber() the third and fourth arguments point
2400 to variables that are updated with a pointer to the new memory and the
2401 number of code units that comprise the substring, again excluding the
2402 terminating zero. When the substring is no longer needed, the memory
2403 should be freed by calling pcre2_substring_free().
2404
2405 The return value from all these functions is zero for success, or a
2406 negative error code. If the pattern match failed, the match failure
2407 code is returned. If a substring number greater than zero is used
2408 after a partial match, PCRE2_ERROR_PARTIAL is returned. Other possible
2409 error codes are:
2410
2411 PCRE2_ERROR_NOMEMORY
2412
2413 The buffer was too small for pcre2_substring_copy_bynumber(), or the
2414 attempt to get memory failed for pcre2_substring_get_bynumber().
2415
2416 PCRE2_ERROR_NOSUBSTRING
2417
2418 There is no substring with that number in the pattern, that is, the
2419 number is greater than the number of capturing parentheses.
2420
2421 PCRE2_ERROR_UNAVAILABLE
2422
2423 The substring number, though not greater than the number of captures in
2424 the pattern, is greater than the number of slots in the ovector, so the
2425 substring could not be captured.
2426
2427 PCRE2_ERROR_UNSET
2428
2429 The substring did not participate in the match. For example, if the
2430 pattern is (abc)|(def) and the subject is "def", and the ovector con‐
2431 tains at least two capturing slots, substring number 1 is unset.
2432
2434
2435 int pcre2_substring_list_get(pcre2_match_data *match_data,
2436 PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
2437
2438 void pcre2_substring_list_free(PCRE2_SPTR *list);
2439
2440 The pcre2_substring_list_get() function extracts all available sub‐
2441 strings and builds a list of pointers to them. It also (optionally)
2442 builds a second list that contains their lengths (in code units),
2443 excluding a terminating zero that is added to each of them. All this is
2444 done in a single block of memory that is obtained using the same memory
2445 allocation function that was used to get the match data block.
2446
2447 This function must be called only after a successful match. If called
2448 after a partial match, the error code PCRE2_ERROR_PARTIAL is returned.
2449
2450 The address of the memory block is returned via listptr, which is also
2451 the start of the list of string pointers. The end of the list is marked
2452 by a NULL pointer. The address of the list of lengths is returned via
2453 lengthsptr. If your strings do not contain binary zeros and you do not
2454 therefore need the lengths, you may supply NULL as the lengthsptr argu‐
2455 ment to disable the creation of a list of lengths. The yield of the
2456 function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the mem‐
2457 ory block could not be obtained. When the list is no longer needed, it
2458 should be freed by calling pcre2_substring_list_free().
2459
2460 If this function encounters a substring that is unset, which can happen
2461 when capturing subpattern number n+1 matches some part of the subject,
2462 but subpattern n has not been used at all, it returns an empty string.
2463 This can be distinguished from a genuine zero-length substring by
2464 inspecting the appropriate offset in the ovector, which contains
2465 PCRE2_UNSET for unset substrings, or by calling pcre2_sub‐
2466 string_length_bynumber().
2467
2469
2470 int pcre2_substring_number_from_name(const pcre2_code *code,
2471 PCRE2_SPTR name);
2472
2473 int pcre2_substring_length_byname(pcre2_match_data *match_data,
2474 PCRE2_SPTR name, PCRE2_SIZE *length);
2475
2476 int pcre2_substring_copy_byname(pcre2_match_data *match_data,
2477 PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
2478
2479 int pcre2_substring_get_byname(pcre2_match_data *match_data,
2480 PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
2481
2482 void pcre2_substring_free(PCRE2_UCHAR *buffer);
2483
2484 To extract a substring by name, you first have to find the associated num‐
2485 ber. For example, for this pattern:
2486
2487 (a+)b(?<xxx>\d+)...
2488
2489 the number of the subpattern called "xxx" is 2. If the name is known to
2490 be unique (PCRE2_DUPNAMES was not set), you can find the number from
2491 the name by calling pcre2_substring_number_from_name(). The first argu‐
2492 ment is the compiled pattern, and the second is the name. The yield of
2493 the function is the subpattern number, PCRE2_ERROR_NOSUBSTRING if there
2494 is no subpattern of that name, or PCRE2_ERROR_NOUNIQUESUBSTRING if
2495 there is more than one subpattern of that name. Given the number, you
2496 can extract the substring directly, or use one of the functions
2497 described above.
2498
2499 For convenience, there are also "byname" functions that correspond to
2500 the "bynumber" functions, the only difference being that the second
2501 argument is a name instead of a number. If PCRE2_DUPNAMES is set and
2502 there are duplicate names, these functions scan all the groups with the
2503 given name, and return the first named string that is set.
2504
2505 If there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is
2506 returned. If all groups with the name have numbers that are greater
2507 than the number of slots in the ovector, PCRE2_ERROR_UNAVAILABLE is
2508 returned. If there is at least one group with a slot in the ovector,
2509 but no group is found to be set, PCRE2_ERROR_UNSET is returned.
2510
2511 Warning: If the pattern uses the (?| feature to set up multiple subpat‐
2512 terns with the same number, as described in the section on duplicate
2513 subpattern numbers in the pcre2pattern page, you cannot use names to
2514 distinguish the different subpatterns, because names are not included
2515 in the compiled code. The matching process uses only numbers. For this
2516 reason, the use of different names for subpatterns of the same number
2517 causes an error at compile time.
2518
2520
2521 int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
2522 PCRE2_SIZE length, PCRE2_SIZE startoffset,
2523 uint32_t options, pcre2_match_data *match_data,
2524 pcre2_match_context *mcontext, PCRE2_SPTR replacement,
2525 PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
2526 PCRE2_SIZE *outlengthptr);
2527
2528 This function calls pcre2_match() and then makes a copy of the subject
2529 string in outputbuffer, replacing the part that was matched with the
2530 replacement string, whose length is supplied in rlength. This can be
2531 given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
2532 which a \K item in a lookahead in the pattern causes the match to end
2533 before it starts are not supported, and give rise to an error return.
2534
2535 The first seven arguments of pcre2_substitute() are the same as for
2536 pcre2_match(), except that the partial matching options are not permit‐
2537 ted, and match_data may be passed as NULL, in which case a match data
2538 block is obtained and freed within this function, using memory manage‐
2539 ment functions from the match context, if provided, or else those that
2540 were used to allocate memory for the compiled code.
2541
2542 The outlengthptr argument must point to a variable that contains the
2543 length, in code units, of the output buffer. If the function is suc‐
2544 cessful, the value is updated to contain the length of the new string,
2545 excluding the trailing zero that is automatically added.
2546
2547 If the function is not successful, the value set via outlengthptr
2548 depends on the type of error. For syntax errors in the replacement
2549 string, the value is the offset in the replacement string where the
2550 error was detected. For other errors, the value is PCRE2_UNSET by
2551 default. This includes the case of the output buffer being too small,
2552 unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set (see below), in which
2553 case the value is the minimum length needed, including space for the
2554 trailing zero. Note that in order to compute the required length,
2555 pcre2_substitute() has to simulate all the matching and copying,
2556 instead of giving an error return as soon as the buffer overflows. Note
2557 also that the length is in code units, not bytes.
2558
2559 In the replacement string, which is interpreted as a UTF string in UTF
2560 mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
2561 option is set, a dollar character is an escape character that can spec‐
2562 ify the insertion of characters from capturing groups or (*MARK) items
2563 in the pattern. The following forms are always recognized:
2564
2565 $$ insert a dollar character
2566 $<n> or ${<n>} insert the contents of group <n>
2567 $*MARK or ${*MARK} insert the name of the last (*MARK) encountered
2568
2569 Either a group number or a group name can be given for <n>. Curly
2570 brackets are required only if the following character would be inter‐
2571 preted as part of the number or name. The number may be zero to include
2572 the entire matched string. For example, if the pattern a(b)c is
2573 matched with "=abc=" and the replacement string "+$1$0$1+", the result
2574 is "=+babcb+=".
2575
2576 The facility for inserting a (*MARK) name can be used to perform simple
2577 simultaneous substitutions, as this pcre2test example shows:
2578
2579 /(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
2580 apple lemon
2581 2: pear orange
2582
2583 As well as the usual options for pcre2_match(), a number of additional
2584 options can be set in the options argument.
2585
2586 PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject
2587 string, replacing every matching substring. If this is not set, only
2588 the first matching substring is replaced. If any matched substring has
2589 zero length, after the substitution has happened, an attempt to find a
2590 non-empty match at the same position is performed. If this is not suc‐
2591 cessful, the current position is advanced by one character except when
2592 CRLF is a valid newline sequence and the next two characters are CR,
2593 LF. In this case, the current position is advanced by two characters.
2594
2595 PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output
2596 buffer is too small. The default action is to return PCRE2_ERROR_NOMEM‐
2597 ORY immediately. If this option is set, however, pcre2_substitute()
2598 continues to go through the motions of matching and substituting (with‐
2599 out, of course, writing anything) in order to compute the size of buf‐
2600 fer that is needed. This value is passed back via the outlengthptr
2601 variable, with the result of the function still being
2602 PCRE2_ERROR_NOMEMORY.
2603
2604 Passing a buffer size of zero is a permitted way of finding out how
2605 much memory is needed for a given substitution. However, this does mean
2606 that the entire operation is carried out twice. Depending on the appli‐
2607 cation, it may be more efficient to allocate a large buffer and free
2608 the excess afterwards, instead of using PCRE2_SUBSTITUTE_OVER‐
2609 FLOW_LENGTH.
2610
2611 PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups
2612 that do not appear in the pattern to be treated as unset groups. This
2613 option should be used with care, because it means that a typo in a
2614 group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING
2615 error.
2616
2617 PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including
2618 unknown groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be
2619 treated as empty strings when inserted as described above. If this
2620 option is not set, an attempt to insert an unset group causes the
2621 PCRE2_ERROR_UNSET error. This option does not influence the extended
2622 substitution syntax described below.
2623
2624 PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the
2625 replacement string. Without this option, only the dollar character is
2626 special, and only the group insertion forms listed above are valid.
2627 When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
2628
2629 Firstly, backslash in a replacement string is interpreted as an escape
2630 character. The usual forms such as \n or \x{ddd} can be used to specify
2631 particular character codes, and backslash followed by any non-alphanu‐
2632 meric character quotes that character. Extended quoting can be coded
2633 using \Q...\E, exactly as in pattern strings.
2634
2635 There are also four escape sequences for forcing the case of inserted
2636 letters. The insertion mechanism has three states: no case forcing,
2637 force upper case, and force lower case. The escape sequences change the
2638 current state: \U and \L change to upper or lower case forcing, respec‐
2639 tively, and \E (when not terminating a \Q quoted sequence) reverts to
2640 no case forcing. The sequences \u and \l force the next character (if
2641 it is a letter) to upper or lower case, respectively, and then the
2642 state automatically reverts to no case forcing. Case forcing applies to
2643 all inserted characters, including those from captured groups and let‐
2644 ters within \Q...\E quoted sequences.
2645
2646 Note that case forcing sequences such as \U...\E do not nest. For exam‐
2647 ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final
2648 \E has no effect.
2649
2650 The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
2651 flexibility to group substitution. The syntax is similar to that used
2652 by Bash:
2653
2654 ${<n>:-<string>}
2655 ${<n>:+<string1>:<string2>}
2656
2657 As before, <n> may be a group number or a name. The first form speci‐
2658 fies a default value. If group <n> is set, its value is inserted; if
2659 not, <string> is expanded and the result inserted. The second form
2660 specifies strings that are expanded and inserted when group <n> is set
2661 or unset, respectively. The first form is just a convenient shorthand
2662 for
2663
2664 ${<n>:+${<n>}:<string>}
2665
2666 Backslash can be used to escape colons and closing curly brackets in
2667 the replacement strings. A change of the case forcing state within a
2668 replacement string remains in force afterwards, as shown in this
2669 pcre2test example:
2670
2671 /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
2672 body
2673 1: hello
2674 somebody
2675 1: HELLO
2676
2677 The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
2678 substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause
2679 unknown groups in the extended syntax forms to be treated as unset.
2680
2681 If successful, pcre2_substitute() returns the number of replacements
2682 that were made. This may be zero if no matches were found, and is never
2683 greater than 1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
2684
2685 In the event of an error, a negative error code is returned. Except for
2686 PCRE2_ERROR_NOMATCH (which is never returned), errors from
2687 pcre2_match() are passed straight back.
2688
2689 PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring inser‐
2690 tion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
2691
2692 PCRE2_ERROR_UNSET is returned for an unset substring insertion (includ‐
2693 ing an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set)
2694 when the simple (non-extended) syntax is used and PCRE2_SUBSTI‐
2695 TUTE_UNSET_EMPTY is not set.
2696
2697 PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big
2698 enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size
2699 of buffer that is needed is returned via outlengthptr. Note that this
2700 does not happen by default.
2701
2702 PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in
2703 the replacement string, with more particular errors being
2704 PCRE2_ERROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REP‐
2705 MISSING_BRACE (closing curly bracket not found), PCRE2_ERROR_BADSUBSTI‐
2706 TUTION (syntax error in extended group substitution), and
2707 PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it started,
2708 which can happen if \K is used in an assertion).
2709
2710 As for all PCRE2 errors, a text message that describes the error can be
2711 obtained by calling the pcre2_get_error_message() function (see
2712 "Obtaining a textual error message" above).
2713
2715
2716 int pcre2_substring_nametable_scan(const pcre2_code *code,
2717 PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
2718
2719 When a pattern is compiled with the PCRE2_DUPNAMES option, names for
2720 subpatterns are not required to be unique. Duplicate names are always
2721 allowed for subpatterns with the same number, created by using the (?|
2722 feature. Indeed, if such subpatterns are named, they are required to
2723 use the same names.
2724
2725 Normally, patterns with duplicate names are such that in any one match,
2726 only one of the named subpatterns participates. An example is shown in
2727 the pcre2pattern documentation.
2728
2729 When duplicates are present, pcre2_substring_copy_byname() and
2730 pcre2_substring_get_byname() return the first substring corresponding
2731 to the given name that is set. Only if none are set is
2732 PCRE2_ERROR_UNSET returned. The pcre2_substring_number_from_name()
2733 function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
2734 duplicate names.
2735
2736 If you want to get full details of all captured substrings for a given
2737 name, you must use the pcre2_substring_nametable_scan() function. The
2738 first argument is the compiled pattern, and the second is the name. If
2739 the third and fourth arguments are NULL, the function returns a group
2740 number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
2741
2742 When the third and fourth arguments are not NULL, they must be pointers
2743 to variables that are updated by the function. After it has run, they
2744 point to the first and last entries in the name-to-number table for the
2745 given name, and the function returns the length of each entry in code
2746 units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
2747 no entries for the given name.
2748
2749 The format of the name table is described above in the section entitled
2750 Information about a pattern. Given all the relevant entries for the
2751 name, you can extract each of their numbers, and hence the captured
2752 data.
2753
2755
2756 The traditional matching function uses a similar algorithm to Perl,
2757 which stops when it finds the first match at a given point in the sub‐
2758 ject. If you want to find all possible matches, or the longest possible
2759 match at a given position, consider using the alternative matching
2760 function (see below) instead. If you cannot use the alternative func‐
2761 tion, you can kludge it up by making use of the callout facility, which
2762 is described in the pcre2callout documentation.
2763
2764 What you have to do is to insert a callout right at the end of the pat‐
2765 tern. When your callout function is called, extract and save the cur‐
2766 rent matched substring. Then return 1, which forces pcre2_match() to
2767 backtrack and try other alternatives. Ultimately, when it runs out of
2768 matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
2769
2771
2772 int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
2773 PCRE2_SIZE length, PCRE2_SIZE startoffset,
2774 uint32_t options, pcre2_match_data *match_data,
2775 pcre2_match_context *mcontext,
2776 int *workspace, PCRE2_SIZE wscount);
2777
2778 The function pcre2_dfa_match() is called to match a subject string
2779 against a compiled pattern, using a matching algorithm that scans the
2780 subject string just once, and does not backtrack. This has different
2781 characteristics to the normal algorithm, and is not compatible with
2782 Perl. Some of the features of PCRE2 patterns are not supported. Never‐
2783 theless, there are times when this kind of matching can be useful. For
2784 a discussion of the two matching algorithms, and a list of features
2785 that pcre2_dfa_match() does not support, see the pcre2matching documen‐
2786 tation.
2787
2788 The arguments for the pcre2_dfa_match() function are the same as for
2789 pcre2_match(), plus two extras. The ovector within the match data block
2790 is used in a different way, and this is described below. The other com‐
2791 mon arguments are used in the same way as for pcre2_match(), so their
2792 description is not repeated here.
2793
2794 The two additional arguments provide workspace for the function. The
2795 workspace vector should contain at least 20 elements. It is used for
2796 keeping track of multiple paths through the pattern tree. More
2797 workspace is needed for patterns and subjects where there are a lot of
2798 potential matches.
2799
2800 Here is an example of a simple call to pcre2_dfa_match():
2801
2802 int wspace[20];
2803 pcre2_match_data *md = pcre2_match_data_create(4, NULL);
2804 int rc = pcre2_dfa_match(
2805 re, /* result of pcre2_compile() */
2806 "some string", /* the subject string */
2807 11, /* the length of the subject string */
2808 0, /* start at offset 0 in the subject */
2809 0, /* default options */
2810 md, /* the match data block */
2811 NULL, /* a match context; NULL means use defaults */
2812 wspace, /* working space vector */
2813 20); /* number of elements (NOT size in bytes) */
2814
2815 Option bits for pcre2_dfa_match()
2816
2817 The unused bits of the options argument for pcre2_dfa_match() must be
2818 zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
2819 PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
2820 PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT,
2821 PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of
2822 these are exactly the same as for pcre2_match(), so their description
2823 is not repeated here.
2824
2825 PCRE2_PARTIAL_HARD
2826 PCRE2_PARTIAL_SOFT
2827
2828 These have the same general effect as they do for pcre2_match(), but
2829 the details are slightly different. When PCRE2_PARTIAL_HARD is set for
2830 pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the
2831 subject is reached and there is still at least one matching possibility
2832 that requires additional characters. This happens even if some complete
2833 matches have already been found. When PCRE2_PARTIAL_SOFT is set, the
2834 return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
2835 if the end of the subject is reached, there have been no complete
2836 matches, but there is still at least one matching possibility. The por‐
2837 tion of the string that was inspected when the longest partial match
2838 was found is set as the first matching string in both cases. There is a
2839 more detailed discussion of partial and multi-segment matching, with
2840 examples, in the pcre2partial documentation.
2841
2842 PCRE2_DFA_SHORTEST
2843
2844 Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to
2845 stop as soon as it has found one match. Because of the way the alterna‐
2846 tive algorithm works, this is necessarily the shortest possible match
2847 at the first possible matching point in the subject string.
2848
2849 PCRE2_DFA_RESTART
2850
2851 When pcre2_dfa_match() returns a partial match, it is possible to call
2852 it again, with additional subject characters, and have it continue with
2853 the same match. The PCRE2_DFA_RESTART option requests this action; when
2854 it is set, the workspace and wscount arguments must reference the same
2855 vector as before because data about the match so far is left in them
2856 after a partial match. There is more discussion of this facility in the
2857 pcre2partial documentation.
2858
2859 Successful returns from pcre2_dfa_match()
2860
2861 When pcre2_dfa_match() succeeds, it may have matched more than one sub‐
2862 string in the subject. Note, however, that all the matches from one run
2863 of the function start at the same point in the subject. The shorter
2864 matches are all initial substrings of the longer matches. For example,
2865 if the pattern
2866
2867 <.*>
2868
2869 is matched against the string
2870
2871 This is <something> <something else> <something further> no more
2872
2873 the three matched strings are
2874
2875 <something> <something else> <something further>
2876 <something> <something else>
2877 <something>
2878
2879 On success, the yield of the function is a number greater than zero,
2880 which is the number of matched substrings. The offsets of the sub‐
2881 strings are returned in the ovector, and can be extracted by number in
2882 the same way as for pcre2_match(), but the numbers bear no relation to
2883 any capturing groups that may exist in the pattern, because DFA match‐
2884 ing does not support group capture.
2885
2886 Calls to the convenience functions that extract substrings by name
2887 return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used
2888 after a DFA match. The convenience functions that extract substrings by
2889 number never return PCRE2_ERROR_NOSUBSTRING, and the meanings of some
2890 other errors are slightly different:
2891
2892 PCRE2_ERROR_UNAVAILABLE
2893
2894 The ovector is not big enough to include a slot for the given substring
2895 number.
2896
2897 PCRE2_ERROR_UNSET
2898
2899 There is a slot in the ovector for this substring, but there were
2900 insufficient matches to fill it.
2901
2902 The matched strings are stored in the ovector in reverse order of
2903 length; that is, the longest matching string is first. If there were
2904 too many matches to fit into the ovector, the yield of the function is
2905 zero, and the vector is filled with the longest matches.
2906
2907 NOTE: PCRE2's "auto-possessification" optimization usually applies to
2908 character repeats at the end of a pattern (as well as internally). For
2909 example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA
2910 matching, this means that only one possible match is found. If you
2911 really do want multiple matches in such cases, either use an ungreedy
2912 repeat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when
2913 compiling.
2914
2915 Error returns from pcre2_dfa_match()
2916
2917 The pcre2_dfa_match() function returns a negative number when it fails.
2918 Many of the errors are the same as for pcre2_match(), as described
2919 above. There are in addition the following errors that are specific to
2920 pcre2_dfa_match():
2921
2922 PCRE2_ERROR_DFA_UITEM
2923
2924 This return is given if pcre2_dfa_match() encounters an item in the
2925 pattern that it does not support, for instance, the use of \C in a UTF
2926 mode or a back reference.
2927
2928 PCRE2_ERROR_DFA_UCOND
2929
2930 This return is given if pcre2_dfa_match() encounters a condition item
2931 that uses a back reference for the condition, or a test for recursion
2932 in a specific group. These are not supported.
2933
2934 PCRE2_ERROR_DFA_WSSIZE
2935
2936 This return is given if pcre2_dfa_match() runs out of space in the
2937 workspace vector.
2938
2939 PCRE2_ERROR_DFA_RECURSE
2940
2941 When a recursive subpattern is processed, the matching function calls
2942 itself recursively, using private memory for the ovector and workspace.
2943 This error is given if the internal ovector is not large enough. This
2944 should be extremely rare, as a vector of size 1000 is used.
2945
2946 PCRE2_ERROR_DFA_BADRESTART
2947
2948 When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option,
2949 some plausibility checks are made on the contents of the workspace,
2950 which should contain data about the previous partial match. If any of
2951 these checks fail, this error is given.
2952
2954
2955 pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3),
2956 pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2stack(3),
2957 pcre2unicode(3).
2958
2960
2961 Philip Hazel
2962 University Computing Service
2963 Cambridge, England.
2964
2966
2967 Last updated: 23 December 2016
2968 Copyright (c) 1997-2016 University of Cambridge.
2969
2970
2971
2972PCRE2 10.23 24 December 2016 PCRE2API(3)