1PCRE2API(3)                Library Functions Manual                PCRE2API(3)
2
3
4

NAME

6       PCRE2 - Perl-compatible regular expressions (revised API)
7
8       #include <pcre2.h>
9
10       PCRE2  is  a  new API for PCRE. This document contains a description of
11       all its functions. See the pcre2 document for an overview  of  all  the
12       PCRE2 documentation.
13

PCRE2 NATIVE API BASIC FUNCTIONS

15
16       pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
17         uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
18         pcre2_compile_context *ccontext);
19
20       void pcre2_code_free(pcre2_code *code);
21
22       pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
23         pcre2_general_context *gcontext);
24
25       pcre2_match_data *pcre2_match_data_create_from_pattern(
26         const pcre2_code *code, pcre2_general_context *gcontext);
27
28       int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
29         PCRE2_SIZE length, PCRE2_SIZE startoffset,
30         uint32_t options, pcre2_match_data *match_data,
31         pcre2_match_context *mcontext);
32
33       int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
34         PCRE2_SIZE length, PCRE2_SIZE startoffset,
35         uint32_t options, pcre2_match_data *match_data,
36         pcre2_match_context *mcontext,
37         int *workspace, PCRE2_SIZE wscount);
38
39       void pcre2_match_data_free(pcre2_match_data *match_data);
40

PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS

42
43       PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
44
45       uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
46
47       PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
48
49       PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
50

PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS

52
53       pcre2_general_context *pcre2_general_context_create(
54         void *(*private_malloc)(PCRE2_SIZE, void *),
55         void (*private_free)(void *, void *), void *memory_data);
56
57       pcre2_general_context *pcre2_general_context_copy(
58         pcre2_general_context *gcontext);
59
60       void pcre2_general_context_free(pcre2_general_context *gcontext);
61

PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS

63
64       pcre2_compile_context *pcre2_compile_context_create(
65         pcre2_general_context *gcontext);
66
67       pcre2_compile_context *pcre2_compile_context_copy(
68         pcre2_compile_context *ccontext);
69
70       void pcre2_compile_context_free(pcre2_compile_context *ccontext);
71
72       int pcre2_set_bsr(pcre2_compile_context *ccontext,
73         uint32_t value);
74
75       int pcre2_set_character_tables(pcre2_compile_context *ccontext,
76         const unsigned char *tables);
77
78       int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
79         PCRE2_SIZE value);
80
81       int pcre2_set_newline(pcre2_compile_context *ccontext,
82         uint32_t value);
83
84       int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
85         uint32_t value);
86
87       int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
88         int (*guard_function)(uint32_t, void *), void *user_data);
89

PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS

91
92       pcre2_match_context *pcre2_match_context_create(
93         pcre2_general_context *gcontext);
94
95       pcre2_match_context *pcre2_match_context_copy(
96         pcre2_match_context *mcontext);
97
98       void pcre2_match_context_free(pcre2_match_context *mcontext);
99
100       int pcre2_set_callout(pcre2_match_context *mcontext,
101         int (*callout_function)(pcre2_callout_block *, void *),
102         void *callout_data);
103
104       int pcre2_set_match_limit(pcre2_match_context *mcontext,
105         uint32_t value);
106
107       int pcre2_set_offset_limit(pcre2_match_context *mcontext,
108         PCRE2_SIZE value);
109
110       int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
111         uint32_t value);
112
113       int pcre2_set_recursion_memory_management(
114         pcre2_match_context *mcontext,
115         void *(*private_malloc)(PCRE2_SIZE, void *),
116         void (*private_free)(void *, void *), void *memory_data);
117

PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS

119
120       int pcre2_substring_copy_byname(pcre2_match_data *match_data,
121         PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
122
123       int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
124         uint32_t number, PCRE2_UCHAR *buffer,
125         PCRE2_SIZE *bufflen);
126
127       void pcre2_substring_free(PCRE2_UCHAR *buffer);
128
129       int pcre2_substring_get_byname(pcre2_match_data *match_data,
130         PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
131
132       int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
133         uint32_t number, PCRE2_UCHAR **bufferptr,
134         PCRE2_SIZE *bufflen);
135
136       int pcre2_substring_length_byname(pcre2_match_data *match_data,
137         PCRE2_SPTR name, PCRE2_SIZE *length);
138
139       int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
140         uint32_t number, PCRE2_SIZE *length);
141
142       int pcre2_substring_nametable_scan(const pcre2_code *code,
143         PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
144
145       int pcre2_substring_number_from_name(const pcre2_code *code,
146         PCRE2_SPTR name);
147
148       void pcre2_substring_list_free(PCRE2_SPTR *list);
149
150       int pcre2_substring_list_get(pcre2_match_data *match_data,
151         PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
152

PCRE2 NATIVE API STRING SUBSTITUTION FUNCTION

154
155       int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
156         PCRE2_SIZE length, PCRE2_SIZE startoffset,
157         uint32_t options, pcre2_match_data *match_data,
158         pcre2_match_context *mcontext, PCRE2_SPTR replacement,
159         PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
160         PCRE2_SIZE *outlengthptr);
161

PCRE2 NATIVE API JIT FUNCTIONS

163
164       int pcre2_jit_compile(pcre2_code *code, uint32_t options);
165
166       int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
167         PCRE2_SIZE length, PCRE2_SIZE startoffset,
168         uint32_t options, pcre2_match_data *match_data,
169         pcre2_match_context *mcontext);
170
171       void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
172
173       pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE startsize,
174         PCRE2_SIZE maxsize, pcre2_general_context *gcontext);
175
176       void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
177         pcre2_jit_callback callback_function, void *callback_data);
178
179       void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
180

PCRE2 NATIVE API SERIALIZATION FUNCTIONS

182
183       int32_t pcre2_serialize_decode(pcre2_code **codes,
184         int32_t number_of_codes, const uint8_t *bytes,
185         pcre2_general_context *gcontext);
186
187       int32_t pcre2_serialize_encode(const pcre2_code **codes,
188         int32_t number_of_codes, uint8_t **serialized_bytes,
189         PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
190
191       void pcre2_serialize_free(uint8_t *bytes);
192
193       int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes);
194

PCRE2 NATIVE API AUXILIARY FUNCTIONS

196
197       pcre2_code *pcre2_code_copy(const pcre2_code *code);
198
199       pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
200
201       int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
202         PCRE2_SIZE bufflen);
203
204       const unsigned char *pcre2_maketables(pcre2_general_context *gcontext);
205
206       int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);
207
208       int pcre2_callout_enumerate(const pcre2_code *code,
209         int (*callback)(pcre2_callout_enumerate_block *, void *),
210         void *user_data);
211
212       int pcre2_config(uint32_t what, void *where);
213

PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES

215
216       There  are  three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit
217       code units, respectively. However,  there  is  just  one  header  file,
218       pcre2.h.   This  contains the function prototypes and other definitions
219       for all three libraries. One, two, or all three can be installed simul‐
220       taneously.  On  Unix-like  systems the libraries are called libpcre2-8,
221       libpcre2-16, and libpcre2-32, and they can also co-exist with the orig‐
222       inal PCRE libraries.
223
224       Character  strings are passed to and from a PCRE2 library as a sequence
225       of unsigned integers in code units  of  the  appropriate  width.  Every
226       PCRE2  function  comes  in three different forms, one for each library,
227       for example:
228
229         pcre2_compile_8()
230         pcre2_compile_16()
231         pcre2_compile_32()
232
233       There are also three different sets of data types:
234
235         PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32
236         PCRE2_SPTR8,  PCRE2_SPTR16,  PCRE2_SPTR32
237
238       The UCHAR types define unsigned code units of the  appropriate  widths.
239       For  example,  PCRE2_UCHAR16 is usually defined as `uint16_t'. The SPTR
240       types are constant pointers to the equivalent  UCHAR  types,  that  is,
241       they are pointers to vectors of unsigned code units.
242
243       Many  applications use only one code unit width. For their convenience,
244       macros are defined whose names are the generic forms such as pcre2_com‐
245       pile()  and  PCRE2_SPTR.  These  macros  use  the  value  of  the macro
246       PCRE2_CODE_UNIT_WIDTH to generate the appropriate width-specific  func‐
247       tion and macro names.  PCRE2_CODE_UNIT_WIDTH is not defined by default.
248       An application must define it to be  8,  16,  or  32  before  including
249       pcre2.h in order to make use of the generic names.
250
251       Applications  that use more than one code unit width can be linked with
252       more than one PCRE2 library, but must define  PCRE2_CODE_UNIT_WIDTH  to
253       be  0  before  including pcre2.h, and then use the real function names.
254       Any code that is to be included in an environment where  the  value  of
255       PCRE2_CODE_UNIT_WIDTH  is  unknown  should  also  use the real function
256       names. (Unfortunately, it is not possible in C code to save and restore
257       the value of a macro.)
258
259       If  PCRE2_CODE_UNIT_WIDTH  is  not  defined before including pcre2.h, a
260       compiler error occurs.
261
262       When using multiple libraries in an application,  you  must  take  care
263       when  processing  any  particular  pattern to use only functions from a
264       single library.  For example, if you want to run a match using  a  pat‐
265       tern  that  was  compiled  with pcre2_compile_16(), you must do so with
266       pcre2_match_16(), not pcre2_match_8().
267
268       In the function summaries above, and in the rest of this  document  and
269       other  PCRE2  documents,  functions  and data types are described using
270       their generic names, without the 8, 16, or 32 suffix.
271

PCRE2 API OVERVIEW

273
274       PCRE2 has its own native API, which  is  described  in  this  document.
275       There are also some wrapper functions for the 8-bit library that corre‐
276       spond to the POSIX regular expression API, but they do not give  access
277       to all the functionality. They are described in the pcre2posix documen‐
278       tation. Both these APIs define a set of C function calls.
279
280       The native API C data types, function prototypes,  option  values,  and
281       error codes are defined in the header file pcre2.h, which contains def‐
282       initions of PCRE2_MAJOR and PCRE2_MINOR, the major  and  minor  release
283       numbers  for the library. Applications can use these to include support
284       for different releases of PCRE2.
285
286       In a Windows environment, if you want to statically link an application
287       program  against  a non-dll PCRE2 library, you must define PCRE2_STATIC
288       before including pcre2.h.
289
290       The functions pcre2_compile() and  pcre2_match() are used for compiling
291       and  matching regular expressions in a Perl-compatible manner. A sample
292       program that demonstrates the simplest way of using them is provided in
293       the file called pcre2demo.c in the PCRE2 source distribution. A listing
294       of this program is  given  in  the  pcre2demo  documentation,  and  the
295       pcre2sample documentation describes how to compile and run it.
296
297       Just-in-time  compiler support is an optional feature of PCRE2 that can
298       be built in appropriate hardware environments. It greatly speeds up the
299       matching  performance of many patterns. Programs can request that it be
300       used if available, by calling pcre2_jit_compile() after a  pattern  has
301       been successfully compiled by pcre2_compile(). This does nothing if JIT
302       support is not available.
303
304       More complicated programs might need to  make  use  of  the  specialist
305       functions    pcre2_jit_stack_create(),    pcre2_jit_stack_free(),   and
306       pcre2_jit_stack_assign() in order to  control  the  JIT  code's  memory
307       usage.
308
309       JIT matching is automatically used by pcre2_match() if it is available,
310       unless the PCRE2_NO_JIT option is set. There is also a direct interface
311       for  JIT  matching,  which gives improved performance. The JIT-specific
312       functions are discussed in the pcre2jit documentation.
313
314       A second matching function, pcre2_dfa_match(), which is  not  Perl-com‐
315       patible,  is  also  provided.  This  uses a different algorithm for the
316       matching. The alternative algorithm finds all possible  matches  (at  a
317       given  point  in  the subject), and scans the subject just once (unless
318       there are lookbehind assertions).  However,  this  algorithm  does  not
319       return  captured  substrings.  A  description of the two matching algo‐
320       rithms  and  their  advantages  and  disadvantages  is  given  in   the
321       pcre2matching    documentation.   There   is   no   JIT   support   for
322       pcre2_dfa_match().
323
324       In addition to the main compiling and  matching  functions,  there  are
325       convenience functions for extracting captured substrings from a subject
326       string that has been matched by pcre2_match(). They are:
327
328         pcre2_substring_copy_byname()
329         pcre2_substring_copy_bynumber()
330         pcre2_substring_get_byname()
331         pcre2_substring_get_bynumber()
332         pcre2_substring_list_get()
333         pcre2_substring_length_byname()
334         pcre2_substring_length_bynumber()
335         pcre2_substring_nametable_scan()
336         pcre2_substring_number_from_name()
337
338       pcre2_substring_free() and pcre2_substring_list_free()  are  also  pro‐
339       vided, to free the memory used for extracted strings.
340
341       The  function  pcre2_substitute()  can be called to match a pattern and
342       return a copy of the subject string with substitutions for  parts  that
343       were matched.
344
345       Functions  whose  names begin with pcre2_serialize_ are used for saving
346       compiled patterns on disc or elsewhere, and reloading them later.
347
348       Finally, there are functions for finding out information about  a  com‐
349       piled  pattern  (pcre2_pattern_info()) and about the configuration with
350       which PCRE2 was built (pcre2_config()).
351
352       Functions with names ending with _free() are used  for  freeing  memory
353       blocks  of  various  sorts.  In all cases, if one of these functions is
354       called with a NULL argument, it does nothing.
355

STRING LENGTHS AND OFFSETS

357
358       The PCRE2 API uses string lengths and  offsets  into  strings  of  code
359       units  in  several  places. These values are always of type PCRE2_SIZE,
360       which is an unsigned integer type, currently always defined as  size_t.
361       The  largest  value  that  can  be  stored  in  such  a  type  (that is
362       ~(PCRE2_SIZE)0) is reserved as a special indicator for  zero-terminated
363       strings  and  unset offsets.  Therefore, the longest string that can be
364       handled is one less than this maximum.
365

NEWLINES

367
368       PCRE2 supports five different conventions for indicating line breaks in
369       strings:  a  single  CR (carriage return) character, a single LF (line‐
370       feed) character, the two-character sequence CRLF, any of the three pre‐
371       ceding,  or any Unicode newline sequence. The Unicode newline sequences
372       are the three just mentioned, plus the single characters  VT  (vertical
373       tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line
374       separator, U+2028), and PS (paragraph separator, U+2029).
375
376       Each of the first three conventions is used by at least  one  operating
377       system as its standard newline sequence. When PCRE2 is built, a default
378       can be specified.  The default default is LF, which is the  Unix  stan‐
379       dard.  However, the newline convention can be changed by an application
380       when calling pcre2_compile(), or it can be specified by special text at
381       the start of the pattern itself; this overrides any other settings. See
382       the pcre2pattern page for details of the special character sequences.
383
384       In the PCRE2 documentation the word "newline"  is  used  to  mean  "the
385       character or pair of characters that indicate a line break". The choice
386       of newline convention affects the handling of the dot, circumflex,  and
387       dollar metacharacters, the handling of #-comments in /x mode, and, when
388       CRLF is a recognized line ending sequence, the match position  advance‐
389       ment for a non-anchored pattern. There is more detail about this in the
390       section on pcre2_match() options below.
391
392       The choice of newline convention does not affect the interpretation  of
393       the \n or \r escape sequences, nor does it affect what \R matches; this
394       has its own separate convention.
395

MULTITHREADING

397
398       In a multithreaded application it is important to keep  thread-specific
399       data  separate  from data that can be shared between threads. The PCRE2
400       library code itself is thread-safe: it contains  no  static  or  global
401       variables.  The  API  is  designed to be fairly simple for non-threaded
402       applications while at the same time ensuring that multithreaded  appli‐
403       cations can use it.
404
405       There are several different blocks of data that are used to pass infor‐
406       mation between the application and the PCRE2 libraries.
407
408   The compiled pattern
409
410       A pointer to the compiled form of a pattern is  returned  to  the  user
411       when pcre2_compile() is successful. The data in the compiled pattern is
412       fixed, and does not change when the pattern is matched.  Therefore,  it
413       is  thread-safe, that is, the same compiled pattern can be used by more
414       than one thread simultaneously. For example, an application can compile
415       all its patterns at the start, before forking off multiple threads that
416       use them. However, if the just-in-time optimization  feature  is  being
417       used,  it  needs  separate  memory stack areas for each thread. See the
418       pcre2jit documentation for more details.
419
420       In a more complicated situation, where patterns are compiled only  when
421       they  are  first needed, but are still shared between threads, pointers
422       to compiled patterns must be protected  from  simultaneous  writing  by
423       multiple threads, at least until a pattern has been compiled. The logic
424       can be something like this:
425
426         Get a read-only (shared) lock (mutex) for pointer
427         if (pointer == NULL)
428           {
429           Get a write (unique) lock for pointer
430           pointer = pcre2_compile(...
431           }
432         Release the lock
433         Use pointer in pcre2_match()
434
435       Of course, testing for compilation errors should also  be  included  in
436       the code.
437
438       If JIT is being used, but the JIT compilation is not being done immedi‐
439       ately (perhaps waiting to see if the pattern  is  used  often  enough),
440       similar logic is required. JIT compilation updates a pointer within the
441       compiled code block, so a thread must gain unique write access  to  the
442       pointer     before    calling    pcre2_jit_compile().    Alternatively,
443       pcre2_code_copy()  or  pcre2_code_copy_with_tables()  can  be  used  to
444       obtain a private copy of the compiled code.
445
446   Context blocks
447
448       The  next main section below introduces the idea of "contexts" in which
449       PCRE2 functions are called. A context is nothing more than a collection
450       of parameters that control the way PCRE2 operates. Grouping a number of
451       parameters together in a context is a convenient way of passing them to
452       a  PCRE2  function without using lots of arguments. The parameters that
453       are stored in contexts are in some sense  "advanced  features"  of  the
454       API. Many straightforward applications will not need to use contexts.
455
456       In a multithreaded application, if the parameters in a context are val‐
457       ues that are never changed, the same context can be  used  by  all  the
458       threads. However, if any thread needs to change any value in a context,
459       it must make its own thread-specific copy.
460
461   Match blocks
462
463       The matching functions need a block of memory for working space and for
464       storing  the  results  of  a  match.  This includes details of what was
465       matched, as well as additional  information  such  as  the  name  of  a
466       (*MARK) setting. Each thread must provide its own copy of this memory.
467

PCRE2 CONTEXTS

469
470       Some  PCRE2  functions have a lot of parameters, many of which are used
471       only by specialist applications, for example,  those  that  use  custom
472       memory  management  or  non-standard character tables. To keep function
473       argument lists at a reasonable size, and at the same time to  keep  the
474       API  extensible,  "uncommon" parameters are passed to certain functions
475       in a context instead of directly. A context is just a block  of  memory
476       that  holds  the  parameter  values.   Applications that do not need to
477       adjust any of the context parameters  can  pass  NULL  when  a  context
478       pointer is required.
479
480       There  are  three different types of context: a general context that is
481       relevant for several PCRE2 operations, a compile-time  context,  and  a
482       match-time context.
483
484   The general context
485
486       At  present,  this  context  just  contains  pointers to (and data for)
487       external memory management  functions  that  are  called  from  several
488       places in the PCRE2 library. The context is named `general' rather than
489       specifically `memory' because in future other fields may be  added.  If
490       you  do not want to supply your own custom memory management functions,
491       you do not need to bother with a general context. A general context  is
492       created by:
493
494       pcre2_general_context *pcre2_general_context_create(
495         void *(*private_malloc)(PCRE2_SIZE, void *),
496         void (*private_free)(void *, void *), void *memory_data);
497
498       The  two  function pointers specify custom memory management functions,
499       whose prototypes are:
500
501         void *private_malloc(PCRE2_SIZE, void *);
502         void  private_free(void *, void *);
503
504       Whenever code in PCRE2 calls these functions, the final argument is the
505       value of memory_data. Either of the first two arguments of the creation
506       function may be NULL, in which case the system memory management  func‐
507       tions  malloc()  and free() are used. (This is not currently useful, as
508       there are no other fields in a general context,  but  in  future  there
509       might  be.)   The  private_malloc()  function  is used (if supplied) to
510       obtain memory for storing the context, and all three values  are  saved
511       as part of the context.
512
513       Whenever  PCRE2  creates a data block of any kind, the block contains a
514       pointer to the free() function that matches the malloc() function  that
515       was  used.  When  the  time  comes  to free the block, this function is
516       called.
517
518       A general context can be copied by calling:
519
520       pcre2_general_context *pcre2_general_context_copy(
521         pcre2_general_context *gcontext);
522
523       The memory used for a general context should be freed by calling:
524
525       void pcre2_general_context_free(pcre2_general_context *gcontext);
526
527
528   The compile context
529
530       A compile context is required if you want to change the default  values
531       of any of the following compile-time parameters:
532
533         What \R matches (Unicode newlines or CR, LF, CRLF only)
534         PCRE2's character tables
535         The newline character sequence
536         The compile time nested parentheses limit
537         The maximum length of the pattern string
538         An external function for stack checking
539
540       A  compile context is also required if you are using custom memory man‐
541       agement.  If none of these apply, just pass NULL as the  context  argu‐
542       ment of pcre2_compile().
543
544       A  compile context is created, copied, and freed by the following func‐
545       tions:
546
547       pcre2_compile_context *pcre2_compile_context_create(
548         pcre2_general_context *gcontext);
549
550       pcre2_compile_context *pcre2_compile_context_copy(
551         pcre2_compile_context *ccontext);
552
553       void pcre2_compile_context_free(pcre2_compile_context *ccontext);
554
555       A compile context is created with default values  for  its  parameters.
556       These can be changed by calling the following functions, which return 0
557       on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
558
559       int pcre2_set_bsr(pcre2_compile_context *ccontext,
560         uint32_t value);
561
562       The value must be PCRE2_BSR_ANYCRLF, to specify that  \R  matches  only
563       CR,  LF,  or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any
564       Unicode line ending sequence. The value is used by the JIT compiler and
565       by   the   two   interpreted   matching  functions,  pcre2_match()  and
566       pcre2_dfa_match().
567
568       int pcre2_set_character_tables(pcre2_compile_context *ccontext,
569         const unsigned char *tables);
570
571       The value must be the result of a  call  to  pcre2_maketables(),  whose
572       only argument is a general context. This function builds a set of char‐
573       acter tables in the current locale.
574
575       int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
576         PCRE2_SIZE value);
577
578       This sets a maximum length, in code units, for the pattern string  that
579       is  to  be  compiled.  If the pattern is longer, an error is generated.
580       This facility is provided so that  applications  that  accept  patterns
581       from  external sources can limit their size. The default is the largest
582       number that a PCRE2_SIZE variable can hold, which is effectively unlim‐
583       ited.
584
585       int pcre2_set_newline(pcre2_compile_context *ccontext,
586         uint32_t value);
587
588       This specifies which characters or character sequences are to be recog‐
589       nized as newlines. The value must be one of PCRE2_NEWLINE_CR  (carriage
590       return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the
591       two-character sequence CR followed by LF),  PCRE2_NEWLINE_ANYCRLF  (any
592       of the above), or PCRE2_NEWLINE_ANY (any Unicode newline sequence).
593
594       When a pattern is compiled with the PCRE2_EXTENDED option, the value of
595       this parameter affects the recognition of white space and  the  end  of
596       internal comments starting with #. The value is saved with the compiled
597       pattern for subsequent use by the JIT compiler and by  the  two  inter‐
598       preted matching functions, pcre2_match() and pcre2_dfa_match().
599
600       int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
601         uint32_t value);
602
603       This parameter adjusts the limit, set when PCRE2 is built (default 250),
604       on the depth of parenthesis nesting in  a  pattern.  This  limit  stops
605       rogue  patterns using up too much system stack when being compiled. The
606       limit applies to parentheses of all kinds, not just capturing parenthe‐
607       ses.
608
609       int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
610         int (*guard_function)(uint32_t, void *), void *user_data);
611
612       There  is at least one application that runs PCRE2 in threads with very
613       limited system stack, where running out of stack is to  be  avoided  at
614       all  costs. The parenthesis limit above cannot take account of how much
615       stack is actually available. For a finer  control,  you  can  supply  a
616       function  that  is  called whenever pcre2_compile() starts to compile a
617       parenthesized part of a pattern. This function  can  check  the  actual
618       stack size (or anything else that it wants to, of course).
619
620       The  first  argument  to  the guard function gives the current depth of
621       nesting, and the second is user data that is set up by the  last  argu‐
622       ment   of   pcre2_set_compile_recursion_guard().   The  guard  function
623       should return zero if all is well, or non-zero to force an error.
624
625   The match context
626
627       A match context is required if you want to change the default values of
628       any of the following match-time parameters:
629
630         A callout function
631         The offset limit for matching an unanchored pattern
632         The limit for calling match() (see below)
633         The limit for calling match() recursively
634
635       A match context is also required if you are using custom memory manage‐
636       ment.  If none of these apply, just pass NULL as the  context  argument
637       of pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match().
638
639       A  match  context  is created, copied, and freed by the following func‐
640       tions:
641
642       pcre2_match_context *pcre2_match_context_create(
643         pcre2_general_context *gcontext);
644
645       pcre2_match_context *pcre2_match_context_copy(
646         pcre2_match_context *mcontext);
647
648       void pcre2_match_context_free(pcre2_match_context *mcontext);
649
650       A match context is created with  default  values  for  its  parameters.
651       These can be changed by calling the following functions, which return 0
652       on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
653
654       int pcre2_set_callout(pcre2_match_context *mcontext,
655         int (*callout_function)(pcre2_callout_block *, void *),
656         void *callout_data);
657
658       This sets up a "callout" function, which PCRE2 will call  at  specified
659       points during a matching operation. Details are given in the pcre2call‐
660       out documentation.
661
662       int pcre2_set_offset_limit(pcre2_match_context *mcontext,
663         PCRE2_SIZE value);
664
665       The offset_limit parameter limits how  far  an  unanchored  search  can
666       advance  in  the  subject string. The default value is PCRE2_UNSET. The
667       pcre2_match()     and      pcre2_dfa_match()      functions      return
668       PCRE2_ERROR_NOMATCH  if  a match with a starting point before or at the
669       given offset is not found. For example, if the pattern /abc/ is matched
670       against  "123abc"  with  an  offset  limit  less  than 3, the result is
671       PCRE2_ERROR_NOMATCH.  A match can never be found  if  the  startoffset
672       argument of pcre2_match() or pcre2_dfa_match() is greater than the off‐
673       set limit.
674
675       When using this facility,  you  must  set  PCRE2_USE_OFFSET_LIMIT  when
676       calling  pcre2_compile() so that when JIT is in use, different code can
677       be compiled. If a match is started with a non-default offset limit when
678       PCRE2_USE_OFFSET_LIMIT is not set, an error is generated.
679
680       The  offset limit facility can be used to track progress when searching
681       large subject strings.  See  also  the  PCRE2_FIRSTLINE  option,  which
682       requires a match to start within the first line of the subject. If this
683       is set with an offset limit, a match must occur in the first  line  and
684       also  within  the  offset limit.  In other words, whichever limit comes
685       first is used.
686
687       int pcre2_set_match_limit(pcre2_match_context *mcontext,
688         uint32_t value);
689
690       The match_limit parameter provides a means  of  preventing  PCRE2  from
691       using up too many resources when processing patterns that are not going
692       to match, but which have a very large number of possibilities in  their
693       search  trees. The classic example is a pattern that uses nested unlim‐
694       ited repeats.
695
696       Internally, pcre2_match() uses a  function  called  match(),  which  it
697       calls  repeatedly (sometimes recursively). The limit set by match_limit
698       is imposed on the number of times this  function  is  called  during  a
699       match, which has the effect of limiting the amount of backtracking that
700       can take place. For patterns that are not anchored, the count  restarts
701       from  zero  for  each position in the subject string. This limit is not
702       relevant to pcre2_dfa_match(), which ignores it.
703
704       When pcre2_match() is called with a pattern that was successfully  pro‐
705       cessed by pcre2_jit_compile(), the way in which matching is executed is
706       entirely different. However, there is still the possibility of  runaway
707       matching  that  goes  on  for  a very long time, and so the match_limit
708       value is also used in this case (but in a different way) to  limit  how
709       long the matching can continue.
710
711       The  default  value  for  the limit can be set when PCRE2 is built; the
712       default default is 10 million, which handles all but the  most  extreme
713       cases.    If    the    limit   is   exceeded,   pcre2_match()   returns
714       PCRE2_ERROR_MATCHLIMIT. A value for the match limit may  also  be  sup‐
715       plied by an item at the start of a pattern of the form
716
717         (*LIMIT_MATCH=ddd)
718
719       where  ddd  is  a  decimal  number.  However, such a setting is ignored
720       unless ddd is less than the limit set by the  caller  of  pcre2_match()
721       or, if no such limit is set, less than the default.
722
723       int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
724         uint32_t value);
725
726       The recursion_limit parameter is similar to match_limit, but instead of
727       limiting the total number of times that match() is  called,  it  limits
728       the  depth  of  recursion. The recursion depth is a smaller number than
729       the total number of calls, because not all calls to match() are  recur‐
730       sive.  This limit is of use only if it is set smaller than match_limit.
731
732       Limiting the recursion depth limits the amount of system stack that can
733       be used, or, when PCRE2 has been compiled to use  memory  on  the  heap
734       instead  of the stack, the amount of heap memory that can be used. This
735       limit is not relevant, and is ignored, when matching is done using  JIT
736       compiled  code.  However,  it  is supported by pcre2_dfa_match(), which
737       uses recursive function calls less frequently than  pcre2_match(),  but
738       which  can  be caused to use a lot of stack by a recursive pattern such
739       as /(.)(?1)/ matched to a very long string.
740
741       The default value for recursion_limit can be set when PCRE2  is  built;
742       the  default  default is the same value as the default for match_limit.
743       If the limit is exceeded, pcre2_match()  and  pcre2_dfa_match()  return
744       PCRE2_ERROR_RECURSIONLIMIT. A value for the recursion limit may also be
745       supplied by an item at the start of a pattern of the form
746
747         (*LIMIT_RECURSION=ddd)
748
749       where ddd is a decimal number.  However,  such  a  setting  is  ignored
750       unless ddd is less than the limit set by the caller of pcre2_match() or
751       pcre2_dfa_match() or, if no such limit is set, less than the default.
752
753       int pcre2_set_recursion_memory_management(
754         pcre2_match_context *mcontext,
755         void *(*private_malloc)(PCRE2_SIZE, void *),
756         void (*private_free)(void *, void *), void *memory_data);
757
758       This function sets up two additional custom memory management functions
759       for  use  by  pcre2_match()  when PCRE2 is compiled to use the heap for
760       remembering backtracking data, instead of recursive function calls that
761       use  the  system stack. There is a discussion about PCRE2's stack usage
762       in the pcre2stack documentation. See the pcre2build  documentation  for
763       details of how to build PCRE2.
764
765       Using  the  heap for recursion is a non-standard way of building PCRE2,
766       for use in environments  that  have  limited  stacks.  Because  of  the
767       greater use of memory management, pcre2_match() runs more slowly. Func‐
768       tions that are different to the general  custom  memory  functions  are
769       provided  so  that  special-purpose  external code can be used for this
770       case, because the memory blocks are all the same size. The  blocks  are
771       retained by pcre2_match() until it is about to exit so that they can be
772       re-used when possible during the match. In the absence of  these  func‐
773       tions,  the normal custom memory management functions are used, if sup‐
774       plied, otherwise the system functions.
775

CHECKING BUILD-TIME OPTIONS

777
778       int pcre2_config(uint32_t what, void *where);
779
780       The function pcre2_config() makes it possible for  a  PCRE2  client  to
781       discover  which  optional  features  have  been compiled into the PCRE2
782       library. The pcre2build documentation  has  more  details  about  these
783       optional features.
784
785       The  first  argument  for pcre2_config() specifies which information is
786       required. The second argument is a pointer to  memory  into  which  the
787       information  is  placed.  If  NULL  is passed, the function returns the
788       amount of memory that is needed  for  the  requested  information.  For
789       calls  that  return  numerical  values,  the  value  is  in bytes; when
790       requesting these values, where should point  to  appropriately  aligned
791       memory.  For calls that return strings, the required length is given in
792       code units, not counting the terminating zero.
793
794       When requesting information, the returned value from pcre2_config()  is
795       non-negative  on success, or the negative error code PCRE2_ERROR_BADOP‐
796       TION if the value in the first argument is not recognized. The  follow‐
797       ing information is available:
798
799         PCRE2_CONFIG_BSR
800
801       The  output  is a uint32_t integer whose value indicates what character
802       sequences the \R  escape  sequence  matches  by  default.  A  value  of
803       PCRE2_BSR_UNICODE  means  that  \R  matches  any  Unicode  line  ending
804       sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches  only  CR,
805       LF, or CRLF. The default can be overridden when a pattern is compiled.
806
807         PCRE2_CONFIG_JIT
808
809       The  output  is  a  uint32_t  integer that is set to one if support for
810       just-in-time compiling is available; otherwise it is set to zero.
811
812         PCRE2_CONFIG_JITTARGET
813
814       The where argument should point to a buffer that is at  least  48  code
815       units  long.  (The  exact  length  required  can  be  found  by calling
816       pcre2_config() with where set to NULL.) The buffer  is  filled  with  a
817       string  that  contains  the  name of the architecture for which the JIT
818       compiler is  configured,  for  example  "x86  32bit  (little  endian  +
819       unaligned)".  If JIT support is not available, PCRE2_ERROR_BADOPTION is
820       returned, otherwise the number of code units used is returned. This  is
821       the length of the string, plus one unit for the terminating zero.
822
823         PCRE2_CONFIG_LINKSIZE
824
825       The output is a uint32_t integer that contains the number of bytes used
826       for internal linkage in compiled regular  expressions.  When  PCRE2  is
827       configured,  the value can be set to 2, 3, or 4, with the default being
828       2. This is the value that is returned by pcre2_config(). However,  when
829       the  16-bit  library  is compiled, a value of 3 is rounded up to 4, and
830       when the 32-bit library is compiled, internal  linkages  always  use  4
831       bytes, so the configured value is not relevant.
832
833       The default value of 2 for the 8-bit and 16-bit libraries is sufficient
834       for all but the most massive patterns, since it allows the size of  the
835       compiled pattern to be up to 64K code units. Larger values allow larger
836       regular expressions to be compiled by those two libraries, but  at  the
837       expense of slower matching.
838
839         PCRE2_CONFIG_MATCHLIMIT
840
841       The  output  is a uint32_t integer that gives the default limit for the
842       number of internal matching function calls in  a  pcre2_match()  execu‐
843       tion. Further details are given with pcre2_match() below.
844
845         PCRE2_CONFIG_NEWLINE
846
847       The  output  is  a  uint32_t  integer whose value specifies the default
848       character sequence that is recognized as meaning "newline". The  values
849       are:
850
851         PCRE2_NEWLINE_CR       Carriage return (CR)
852         PCRE2_NEWLINE_LF       Linefeed (LF)
853         PCRE2_NEWLINE_CRLF     Carriage return, linefeed (CRLF)
854         PCRE2_NEWLINE_ANY      Any Unicode line ending
855         PCRE2_NEWLINE_ANYCRLF  Any of CR, LF, or CRLF
856
857       The  default  should  normally  correspond to the standard sequence for
858       your operating system.
859
860         PCRE2_CONFIG_PARENSLIMIT
861
862       The output is a uint32_t integer that gives the maximum depth of  nest‐
863       ing of parentheses (of any kind) in a pattern. This limit is imposed to
864       cap the amount of system stack used when a pattern is compiled.  It  is
865       specified  when PCRE2 is built; the default is 250. This limit does not
866       take into account the stack that may already be  used  by  the  calling
867       application.  For  finer  control  over  compilation  stack  usage, see
868       pcre2_set_compile_recursion_guard().
869
870         PCRE2_CONFIG_RECURSIONLIMIT
871
872       The output is a uint32_t integer that gives the default limit  for  the
873       depth  of  recursion  when  calling the internal matching function in a
874       pcre2_match() execution. Further details are given  with  pcre2_match()
875       below.
876
877         PCRE2_CONFIG_STACKRECURSE
878
879       The  output is a uint32_t integer that is set to one if internal recur‐
880       sion when running pcre2_match() is implemented  by  recursive  function
881       calls  that  use  the system stack to remember their state. This is the
882       usual way that PCRE2 is compiled. The output is zero if PCRE2 was  com‐
883       piled  to  use blocks of data on the heap instead of recursive function
884       calls.
885
886         PCRE2_CONFIG_UNICODE_VERSION
887
888       The where argument should point to a buffer that is at  least  24  code
889       units  long.  (The  exact  length  required  can  be  found  by calling
890       pcre2_config() with where set to NULL.)  If  PCRE2  has  been  compiled
891       without  Unicode  support,  the buffer is filled with the text "Unicode
892       not supported". Otherwise, the Unicode  version  string  (for  example,
893       "8.0.0")  is  inserted. The number of code units used is returned. This
894       is the length of the string plus one unit for the terminating zero.
895
896         PCRE2_CONFIG_UNICODE
897
898       The output is a uint32_t integer that is set to one if Unicode  support
899       is  available; otherwise it is set to zero. Unicode support implies UTF
900       support.
901
902         PCRE2_CONFIG_VERSION
903
904       The where argument should point to a buffer that is at  least  12  code
905       units  long.  (The  exact  length  required  can  be  found  by calling
906       pcre2_config() with where set to NULL.) The buffer is filled  with  the
907       PCRE2 version string, zero-terminated. The number of code units used is
908       returned. This is the length of the string plus one unit for the termi‐
909       nating zero.
910

COMPILING A PATTERN

912
913       pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
914         uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
915         pcre2_compile_context *ccontext);
916
917       void pcre2_code_free(pcre2_code *code);
918
919       pcre2_code *pcre2_code_copy(const pcre2_code *code);
920
921       pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
922
923       The  pcre2_compile() function compiles a pattern into an internal form.
924       The pattern is defined by a pointer to a string of  code  units  and  a
925       length.  If the pattern is zero-terminated, the length can be specified
926       as PCRE2_ZERO_TERMINATED. The function returns a pointer to a block  of
927       memory  that contains the compiled pattern and related data, or NULL if
928       an error occurred.
929
930       If the compile context argument ccontext is NULL, memory for  the  com‐
931       piled  pattern  is  obtained  by  calling  malloc().  Otherwise,  it is
932       obtained from the same memory function that was used  for  the  compile
933       context.  The  caller must free the memory by calling pcre2_code_free()
934       when it is no longer needed.
935
936       The function pcre2_code_copy() makes a copy of the compiled code in new
937       memory,  using  the same memory allocator as was used for the original.
938       However, if the code has  been  processed  by  the  JIT  compiler  (see
939       below),  the  JIT information cannot be copied (because it is position-
940       dependent).  The new copy can initially be used only for non-JIT match‐
941       ing, though it can be passed to pcre2_jit_compile() if required.
942
943       The pcre2_code_copy() function provides a way for individual threads in
944       a multithreaded application to acquire a private copy  of  shared  com‐
945       piled  code.   However, it does not make a copy of the character tables
946       used by the compiled pattern; the new pattern code points to  the  same
947       tables  as  the original code.  (See "Locale Support" below for details
948       of these character tables.) In many applications the  same  tables  are
949       used  throughout, so this behaviour is appropriate. Nevertheless, there
950       are occasions when a copy of a compiled pattern and the relevant tables
951       are  needed.  The pcre2_code_copy_with_tables() provides this facility.
952       Copies of both the code and the tables are  made,  with  the  new  code
953       pointing  to the new tables. The memory for the new tables is automati‐
954       cally freed when pcre2_code_free() is called for the new  copy  of  the
955       compiled code.
956
957       NOTE:  When  one  of  the matching functions is called, pointers to the
958       compiled pattern and the subject string are set in the match data block
959       so  that  they can be referenced by the substring extraction functions.
960       After running a match, you must not free a compiled pattern (or a  sub‐
961       ject  string)  until  after all operations on the match data block have
962       taken place.
963
964       The options argument for pcre2_compile() contains various bit  settings
965       that  affect  the  compilation.  It  should  be  zero if no options are
966       required. The available options are described below. Some of  them  (in
967       particular,  those  that  are  compatible with Perl, but some others as
968       well) can also be set and  unset  from  within  the  pattern  (see  the
969       detailed description in the pcre2pattern documentation).
970
971       For  those options that can be different in different parts of the pat‐
972       tern, the contents of the options  argument  specify their settings  at
973       the  start  of  compilation.  The PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK
974       options can be set at the time of matching as well as at compile time.
975
976       Other, less frequently required compile-time parameters  (for  example,
977       the newline setting) can be provided in a compile context (as described
978       above).
979
980       If errorcode or erroroffset is NULL, pcre2_compile() returns NULL imme‐
981       diately.  Otherwise,  the  variables to which these point are set to an
982       error code and an offset (number of code  units)  within  the  pattern,
983       respectively,  when  pcre2_compile() returns NULL because a compilation
984       error has occurred. The values are not defined when compilation is suc‐
985       cessful and pcre2_compile() returns a non-NULL value.
986
987       The value returned in erroroffset is an indication of where in the pat‐
988       tern the error occurred. It is not necessarily the  furthest  point  in
989       the  pattern  that  was  read. For example, after the error "lookbehind
990       assertion is not fixed length", the error offset points to the start of
991       the failing assertion.
992
993       The  pcre2_get_error_message() function (see "Obtaining a textual error
994       message" below) provides a textual message for each error code.  Compi‐
995       lation errors have positive error codes; UTF formatting error codes are
996       negative. For an invalid UTF-8 or UTF-16 string, the offset is that  of
997       the first code unit of the failing character.
998
999       Some  errors are not detected until the whole pattern has been scanned;
1000       in these cases, the offset passed back is the length  of  the  pattern.
1001       Note  that  the  offset is in code units, not characters, even in a UTF
1002       mode. It may sometimes point into the middle of a UTF-8 or UTF-16 char‐
1003       acter.
1004
1005       This  code  fragment shows a typical straightforward call to pcre2_com‐
1006       pile():
1007
1008         pcre2_code *re;
1009         PCRE2_SIZE erroffset;
1010         int errorcode;
1011         re = pcre2_compile(
1012           "^A.*Z",                /* the pattern */
1013           PCRE2_ZERO_TERMINATED,  /* the pattern is zero-terminated */
1014           0,                      /* default options */
1015           &errorcode,             /* for error code */
1016           &erroffset,             /* for error offset */
1017           NULL);                  /* no compile context */
1018
1019       The following names for option bits are defined in the  pcre2.h  header
1020       file:
1021
1022         PCRE2_ANCHORED
1023
1024       If this bit is set, the pattern is forced to be "anchored", that is, it
1025       is constrained to match only at the first matching point in the  string
1026       that  is being searched (the "subject string"). This effect can also be
1027       achieved by appropriate constructs in the pattern itself, which is  the
1028       only way to do it in Perl.
1029
1030         PCRE2_ALLOW_EMPTY_CLASS
1031
1032       By  default, for compatibility with Perl, a closing square bracket that
1033       immediately follows an opening one is treated as a data  character  for
1034       the  class.  When  PCRE2_ALLOW_EMPTY_CLASS  is  set,  it terminates the
1035       class, which therefore contains no characters and so can never match.
1036
1037         PCRE2_ALT_BSUX
1038
1039       This option requests alternative handling of  three  escape  sequences,
1040       which  makes  PCRE2's  behaviour more like ECMAscript (aka JavaScript).
1041       When it is set:
1042
1043       (1) \U matches an upper case "U" character; by default \U causes a com‐
1044       pile time error (Perl uses \U to upper case subsequent characters).
1045
1046       (2) \u matches a lower case "u" character unless it is followed by four
1047       hexadecimal digits, in which case the hexadecimal  number  defines  the
1048       code  point  to match. By default, \u causes a compile time error (Perl
1049       uses it to upper case the following character).
1050
1051       (3) \x matches a lower case "x" character unless it is followed by  two
1052       hexadecimal  digits,  in  which case the hexadecimal number defines the
1053       code point to match. By default, as in Perl, a  hexadecimal  number  is
1054       always expected after \x, but it may have zero, one, or two digits (so,
1055       for example, \xz matches a binary zero character followed by z).
1056
1057         PCRE2_ALT_CIRCUMFLEX
1058
1059       In  multiline  mode  (when  PCRE2_MULTILINE  is  set),  the  circumflex
1060       metacharacter  matches at the start of the subject (unless PCRE2_NOTBOL
1061       is set), and also after any internal  newline.  However,  it  does  not
1062       match after a newline at the end of the subject, for compatibility with
1063       Perl. If you want a multiline circumflex also to match after  a  termi‐
1064       nating newline, you must set PCRE2_ALT_CIRCUMFLEX.
1065
1066         PCRE2_ALT_VERBNAMES
1067
1068       By  default, for compatibility with Perl, the name in any verb sequence
1069       such as (*MARK:NAME) is  any  sequence  of  characters  that  does  not
1070       include  a  closing  parenthesis. The name is not processed in any way,
1071       and it is not possible to include a closing parenthesis  in  the  name.
1072       However,  if  the  PCRE2_ALT_VERBNAMES  option is set, normal backslash
1073       processing is applied to verb  names  and  only  an  unescaped  closing
1074       parenthesis  terminates the name. A closing parenthesis can be included
1075       in a name either as \) or between \Q  and  \E.  If  the  PCRE2_EXTENDED
1076       option is set, unescaped whitespace in verb names is skipped and #-com‐
1077       ments are recognized, exactly as in the rest of the pattern.
1078
1079         PCRE2_AUTO_CALLOUT
1080
1081       If this bit  is  set,  pcre2_compile()  automatically  inserts  callout
1082       items,  all  with  number 255, before each pattern item, except immedi‐
1083       ately before or after a callout in the pattern. For discussion  of  the
1084       callout facility, see the pcre2callout documentation.
1085
1086         PCRE2_CASELESS
1087
1088       If  this  bit is set, letters in the pattern match both upper and lower
1089       case letters in the subject. It is equivalent to Perl's /i option,  and
1090       it can be changed within a pattern by a (?i) option setting.
1091
1092         PCRE2_DOLLAR_ENDONLY
1093
1094       If  this bit is set, a dollar metacharacter in the pattern matches only
1095       at the end of the subject string. Without this option,  a  dollar  also
1096       matches  immediately before a newline at the end of the string (but not
1097       before any other newlines). The PCRE2_DOLLAR_ENDONLY option is  ignored
1098       if  PCRE2_MULTILINE  is  set.  There is no equivalent to this option in
1099       Perl, and no way to set it within a pattern.
1100
1101         PCRE2_DOTALL
1102
1103       If this bit is set, a dot metacharacter  in  the  pattern  matches  any
1104       character,  including  one  that  indicates a newline. However, it only
1105       ever matches one character, even if newlines are coded as CRLF. Without
1106       this option, a dot does not match when the current position in the sub‐
1107       ject is at a newline. This option is equivalent to  Perl's  /s  option,
1108       and it can be changed within a pattern by a (?s) option setting. A neg‐
1109       ative class such as [^a] always matches newline characters, independent
1110       of the setting of this option.
1111
1112         PCRE2_DUPNAMES
1113
1114       If  this  bit is set, names used to identify capturing subpatterns need
1115       not be unique. This can be helpful for certain types of pattern when it
1116       is  known  that  only  one instance of the named subpattern can ever be
1117       matched. There are more details of named subpatterns  below;  see  also
1118       the pcre2pattern documentation.
1119
1120         PCRE2_EXTENDED
1121
1122       If  this  bit  is  set,  most white space characters in the pattern are
1123       totally ignored except when escaped or inside a character  class.  How‐
1124       ever,  white  space  is  not  allowed within sequences such as (?> that
1125       introduce various parenthesized subpatterns, nor within numerical quan‐
1126       tifiers  such  as {1,3}.  Ignorable white space is permitted between an
1127       item and a following quantifier and between a quantifier and a  follow‐
1128       ing + that indicates possessiveness.
1129
1130       PCRE2_EXTENDED  also causes characters between an unescaped # outside a
1131       character class and the next newline, inclusive, to be  ignored,  which
1132       makes it possible to include comments inside complicated patterns. Note
1133       that the end of this type of comment is a literal newline  sequence  in
1134       the pattern; escape sequences that happen to represent a newline do not
1135       count. PCRE2_EXTENDED is equivalent to Perl's /x option, and it can  be
1136       changed within a pattern by a (?x) option setting.
1137
1138       Which characters are interpreted as newlines can be specified by a set‐
1139       ting in the compile context that is passed to pcre2_compile() or  by  a
1140       special  sequence at the start of the pattern, as described in the sec‐
1141       tion entitled "Newline conventions" in the pcre2pattern  documentation.
1142       A default is defined when PCRE2 is built.
1143
1144         PCRE2_FIRSTLINE
1145
1146       If  this  option  is  set,  an  unanchored pattern is required to match
1147       before or at the first  newline  in  the  subject  string,  though  the
1148       matched  text  may  continue  over the newline. See also PCRE2_USE_OFF‐
1149       SET_LIMIT,  which  provides  a  more  general  limiting  facility.   If
1150       PCRE2_FIRSTLINE  is set with an offset limit, a match must occur in the
1151       first line and also within the offset limit. In other words,  whichever
1152       limit comes first is used.
1153
1154         PCRE2_MATCH_UNSET_BACKREF
1155
1156       If  this  option  is set, a back reference to an unset subpattern group
1157       matches an empty string (by default this causes  the  current  matching
1158       alternative  to  fail).   A  pattern such as (\1)(a) succeeds when this
1159       option is set (assuming it can find an "a" in the subject), whereas  it
1160       fails  by  default,  for  Perl compatibility. Setting this option makes
1161       PCRE2 behave more like ECMAscript (aka JavaScript).
1162
1163         PCRE2_MULTILINE
1164
1165       By default, for the purposes of matching "start of line"  and  "end  of
1166       line",  PCRE2  treats the subject string as consisting of a single line
1167       of characters, even if it actually contains  newlines.  The  "start  of
1168       line"  metacharacter  (^)  matches only at the start of the string, and
1169       the "end of line" metacharacter ($) matches only  at  the  end  of  the
1170       string,  or  before  a  terminating  newline  (except  when  PCRE2_DOL‐
1171       LAR_ENDONLY is set). Note, however, that unless  PCRE2_DOTALL  is  set,
1172       the "any character" metacharacter (.) does not match at a newline. This
1173       behaviour (for ^, $, and dot) is the same as Perl.
1174
1175       When  PCRE2_MULTILINE  is  set, the "start of line" and "end  of  line"
1176       constructs  match  immediately following or immediately before internal
1177       newlines in the subject string, respectively, as well as  at  the  very
1178       start  and  end.  This is equivalent to Perl's /m option, and it can be
1179       changed within a pattern by a (?m) option setting. Note that the "start
1180       of line" metacharacter does not match after a newline at the end of the
1181       subject, for compatibility with Perl.  However, you can change this  by
1182       setting  the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a
1183       subject string, or no occurrences of ^  or  $  in  a  pattern,  setting
1184       PCRE2_MULTILINE has no effect.
1185
1186         PCRE2_NEVER_BACKSLASH_C
1187
1188       This  option  locks out the use of \C in the pattern that is being com‐
1189       piled.  This escape can  cause  unpredictable  behaviour  in  UTF-8  or
1190       UTF-16  modes,  because  it may leave the current matching point in the
1191       middle of a multi-code-unit character. This option  may  be  useful  in
1192       applications  that  process  patterns  from external sources. Note that
1193       there is also a build-time option that permanently locks out the use of
1194       \C.
1195
1196         PCRE2_NEVER_UCP
1197
1198       This  option  locks  out the use of Unicode properties for handling \B,
1199       \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as
1200       described  for  the  PCRE2_UCP option below. In particular, it prevents
1201       the creator of the pattern from enabling this facility by starting  the
1202       pattern  with  (*UCP).  This  option may be useful in applications that
1203       process patterns from external sources. The  combination  of  PCRE2_UCP
1204       and PCRE2_NEVER_UCP causes an error.
1205
1206         PCRE2_NEVER_UTF
1207
1208       This  option  locks out interpretation of the pattern as UTF-8, UTF-16,
1209       or UTF-32, depending on which library is in use. In particular, it pre‐
1210       vents  the  creator of the pattern from switching to UTF interpretation
1211       by starting the pattern with (*UTF).  This  option  may  be  useful  in
1212       applications  that process patterns from external sources. The combina‐
1213       tion of PCRE2_UTF and PCRE2_NEVER_UTF causes an error.
1214
1215         PCRE2_NO_AUTO_CAPTURE
1216
1217       If this option is set, it disables the use of numbered capturing paren‐
1218       theses  in the pattern. Any opening parenthesis that is not followed by
1219       ? behaves as if it were followed by ?: but named parentheses can  still
1220       be  used  for  capturing  (and  they acquire numbers in the usual way).
1221       There is no equivalent of this option  in  Perl.  Note  that,  if  this
1222       option  is  set,  references  to  capturing  groups (back references or
1223       recursion/subroutine calls) may only refer to named groups, though  the
1224       reference can be by name or by number.
1225
1226         PCRE2_NO_AUTO_POSSESS
1227
1228       If this option is set, it disables "auto-possessification", which is an
1229       optimization that, for example, turns a+b into a++b in order  to  avoid
1230       backtracks  into  a+ that can never be successful. However, if callouts
1231       are in use, auto-possessification means that some  callouts  are  never
1232       taken. You can set this option if you want the matching functions to do
1233       a full unoptimized search and run all the callouts, but  it  is  mainly
1234       provided for testing purposes.
1235
1236         PCRE2_NO_DOTSTAR_ANCHOR
1237
1238       If this option is set, it disables an optimization that is applied when
1239       .* is the first significant item in a top-level branch  of  a  pattern,
1240       and  all  the  other branches also start with .* or with \A or \G or ^.
1241       The optimization is automatically disabled for .* if it  is  inside  an
1242       atomic  group or a capturing group that is the subject of a back refer‐
1243       ence, or if the pattern contains (*PRUNE) or (*SKIP).  When  the  opti‐
1244       mization  is  not disabled, such a pattern is automatically anchored if
1245       PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set
1246       for  any  ^ items. Otherwise, the fact that any match must start either
1247       at the start of the subject or following a newline is remembered.  Like
1248       other optimizations, this can cause callouts to be skipped.
1249
1250         PCRE2_NO_START_OPTIMIZE
1251
1252       This  is  an  option whose main effect is at matching time. It does not
1253       change what pcre2_compile() generates, but it does affect the output of
1254       the JIT compiler.
1255
1256       There  are  a  number of optimizations that may occur at the start of a
1257       match, in order to speed up the process. For example, if  it  is  known
1258       that  an  unanchored  match  must  start with a specific character, the
1259       matching code searches the subject for that character, and fails  imme‐
1260       diately  if it cannot find it, without actually running the main match‐
1261       ing function. This means that a special item such as (*COMMIT)  at  the
1262       start  of  a  pattern is not considered until after a suitable starting
1263       point for the match has been found.  Also,  when  callouts  or  (*MARK)
1264       items  are  in use, these "start-up" optimizations can cause them to be
1265       skipped if the pattern is never actually used. The  start-up  optimiza‐
1266       tions  are  in effect a pre-scan of the subject that takes place before
1267       the pattern is run.
1268
1269       The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
1270       possibly  causing  performance  to  suffer,  but ensuring that in cases
1271       where the result is "no match", the callouts do occur, and  that  items
1272       such as (*COMMIT) and (*MARK) are considered at every possible starting
1273       position in the subject string.
1274
1275       Setting PCRE2_NO_START_OPTIMIZE may change the outcome  of  a  matching
1276       operation.  Consider the pattern
1277
1278         (*COMMIT)ABC
1279
1280       When  this  is compiled, PCRE2 records the fact that a match must start
1281       with the character "A". Suppose the subject  string  is  "DEFABC".  The
1282       start-up  optimization  scans along the subject, finds "A" and runs the
1283       first match attempt from there. The (*COMMIT) item means that the  pat‐
1284       tern  must  match the current starting position, which in this case, it
1285       does. However, if the same match is  run  with  PCRE2_NO_START_OPTIMIZE
1286       set,  the  initial  scan  along the subject string does not happen. The
1287       first match attempt is run starting  from  "D"  and  when  this  fails,
1288       (*COMMIT)  prevents  any  further  matches  being tried, so the overall
1289       result is "no match". There are also other start-up optimizations.  For
1290       example, a minimum length for the subject may be recorded. Consider the
1291       pattern
1292
1293         (*MARK:A)(X|Y)
1294
1295       The minimum length for a match is one  character.  If  the  subject  is
1296       "ABC", there will be attempts to match "ABC", "BC", and "C". An attempt
1297       to match an empty string at the end of the subject does not take place,
1298       because  PCRE2  knows  that  the  subject  is now too short, and so the
1299       (*MARK) is never encountered. In this case, the optimization  does  not
1300       affect the overall match result, which is still "no match", but it does
1301       affect the auxiliary information that is returned.
1302
1303         PCRE2_NO_UTF_CHECK
1304
1305       When PCRE2_UTF is set, the validity of the pattern as a UTF  string  is
1306       automatically  checked.  There  are  discussions  about the validity of
1307       UTF-8 strings, UTF-16 strings, and UTF-32 strings in  the  pcre2unicode
1308       document.  If an invalid UTF sequence is found, pcre2_compile() returns
1309       a negative error code.
1310
1311       If you know that your pattern is valid, and you want to skip this check
1312       for  performance  reasons,  you  can set the PCRE2_NO_UTF_CHECK option.
1313       When it is set, the effect of passing an invalid UTF string as  a  pat‐
1314       tern  is  undefined.  It  may cause your program to crash or loop. Note
1315       that  this  option  can   also   be   passed   to   pcre2_match()   and
1316       pcre2_dfa_match(), to suppress validity checking of the subject string.
1317
1318         PCRE2_UCP
1319
1320       This option changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W,
1321       \w, and some of the POSIX character classes.  By  default,  only  ASCII
1322       characters  are recognized, but if PCRE2_UCP is set, Unicode properties
1323       are used instead to classify characters. More details are given in  the
1324       section on generic character types in the pcre2pattern page. If you set
1325       PCRE2_UCP, matching one of the items it affects takes much longer.  The
1326       option  is  available only if PCRE2 has been compiled with Unicode sup‐
1327       port.
1328
1329         PCRE2_UNGREEDY
1330
1331       This option inverts the "greediness" of the quantifiers  so  that  they
1332       are  not greedy by default, but become greedy if followed by "?". It is
1333       not compatible with Perl. It can also be set by a (?U)  option  setting
1334       within the pattern.
1335
1336         PCRE2_USE_OFFSET_LIMIT
1337
1338       This option must be set for pcre2_compile() if pcre2_set_offset_limit()
1339       is going to be used to set a non-default offset limit in a  match  con‐
1340       text  for  matches  that  use this pattern. An error is generated if an
1341       offset limit is set without this option.  For  more  details,  see  the
1342       description  of  pcre2_set_offset_limit() in the section that describes
1343       match contexts. See also the PCRE2_FIRSTLINE option above.
1344
1345         PCRE2_UTF
1346
1347       This option causes PCRE2 to regard both the  pattern  and  the  subject
1348       strings  that  are  subsequently processed as strings of UTF characters
1349       instead of single-code-unit strings. It  is  available  when  PCRE2  is
1350       built  to  include  Unicode  support (which is the default). If Unicode
1351       support is not available, the use of this  option  provokes  an  error.
1352       Details  of how this option changes the behaviour of PCRE2 are given in
1353       the pcre2unicode page.
1354

COMPILATION ERROR CODES

1356
1357       There are over 80 positive error codes that pcre2_compile() may  return
1358       (via  errorcode)  if  it  finds an error in the pattern. There are also
1359       some negative error codes that are used for invalid UTF strings.  These
1360       are  the  same as given by pcre2_match() and pcre2_dfa_match(), and are
1361       described in the pcre2unicode page. The pcre2_get_error_message() func‐
1362       tion  (see  "Obtaining a textual error message" below) can be called to
1363       obtain a textual error message from any error code.
1364

JUST-IN-TIME (JIT) COMPILATION

1366
1367       int pcre2_jit_compile(pcre2_code *code, uint32_t options);
1368
1369       int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
1370         PCRE2_SIZE length, PCRE2_SIZE startoffset,
1371         uint32_t options, pcre2_match_data *match_data,
1372         pcre2_match_context *mcontext);
1373
1374       void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
1375
1376       pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE startsize,
1377         PCRE2_SIZE maxsize, pcre2_general_context *gcontext);
1378
1379       void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
1380         pcre2_jit_callback callback_function, void *callback_data);
1381
1382       void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
1383
1384       These functions provide support for  JIT  compilation,  which,  if  the
1385       just-in-time  compiler  is available, further processes a compiled pat‐
1386       tern into machine code that executes much faster than the pcre2_match()
1387       interpretive  matching function. Full details are given in the pcre2jit
1388       documentation.
1389
1390       JIT compilation is a heavyweight optimization. It can  take  some  time
1391       for  patterns  to  be analyzed, and for one-off matches and simple pat‐
1392       terns the benefit of faster execution might be offset by a much  slower
1393       compilation  time.  Most, but not all, patterns can be optimized by the
1394       JIT compiler.
1395

LOCALE SUPPORT

1397
1398       PCRE2 handles caseless matching, and determines whether characters  are
1399       letters,  digits, or whatever, by reference to a set of tables, indexed
1400       by character code point. This applies only  to  characters  whose  code
1401       points  are  less than 256. By default, higher-valued code points never
1402       match escapes such as \w or \d.  However, if PCRE2 is  built  with  UTF
1403       support,  all  characters  can  be  tested with \p and \P, or, alterna‐
1404       tively, the PCRE2_UCP option can be set when  a  pattern  is  compiled;
1405       this  causes  \w and friends to use Unicode property support instead of
1406       the built-in tables.
1407
1408       The use of locales with Unicode is discouraged.  If  you  are  handling
1409       characters  with  code  points  greater than 127, you should either use
1410       Unicode support, or use locales, but not try to mix the two.
1411
1412       PCRE2 contains an internal set of character tables  that  are  used  by
1413       default.   These  are  sufficient  for many applications. Normally, the
1414       internal tables recognize only ASCII characters. However, when PCRE2 is
1415       built, it is possible to cause the internal tables to be rebuilt in the
1416       default "C" locale of the local system, which may cause them to be dif‐
1417       ferent.
1418
1419       The  internal tables can be overridden by tables supplied by the appli‐
1420       cation that calls PCRE2. These may be created  in  a  different  locale
1421       from  the  default.  As more and more applications change to using Uni‐
1422       code, the need for this locale support is expected to die away.
1423
1424       External tables are built by calling the  pcre2_maketables()  function,
1425       in  the relevant locale. The result can be passed to pcre2_compile() as
1426       often  as  necessary,  by  creating  a  compile  context  and   calling
1427       pcre2_set_character_tables()  to  set  the  tables pointer therein. For
1428       example, to build and use tables that are appropriate  for  the  French
1429       locale  (where  accented  characters  with  values greater than 127 are
1430       treated as letters), the following code could be used:
1431
1432         setlocale(LC_CTYPE, "fr_FR");
1433         tables = pcre2_maketables(NULL);
1434         ccontext = pcre2_compile_context_create(NULL);
1435         pcre2_set_character_tables(ccontext, tables);
1436         re = pcre2_compile(..., ccontext);
1437
1438       The locale name "fr_FR" is used on Linux and other  Unix-like  systems;
1439       if  you  are using Windows, the name for the French locale is "french".
1440       It is the caller's responsibility to ensure that the memory  containing
1441       the tables remains available for as long as it is needed.
1442
1443       The pointer that is passed (via the compile context) to pcre2_compile()
1444       is saved with the compiled pattern, and the same  tables  are  used  by
1445       pcre2_match() and pcre2_dfa_match(). Thus, for any single pattern, com‐
1446       pilation and matching both happen in  the  same  locale,  but  different
1447       patterns can be processed in different locales.
1448

INFORMATION ABOUT A COMPILED PATTERN

1450
1451       int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);
1452
1453       The  pcre2_pattern_info()  function returns general information about a
1454       compiled pattern. For information about callouts, see the next section.
1455       The  first  argument  for pcre2_pattern_info() is a pointer to the com‐
1456       piled pattern. The second argument specifies which piece of information
1457       is  required,  and  the  third  argument  is a pointer to a variable to
1458       receive the data. If the third argument is NULL, the first argument  is
1459       ignored,  and  the  function  returns the size in bytes of the variable
1460       that is required for the information requested. Otherwise, the yield of
1461       the function is zero for success, or one of the following negative num‐
1462       bers:
1463
1464         PCRE2_ERROR_NULL           the argument code was NULL
1465         PCRE2_ERROR_BADMAGIC       the "magic number" was not found
1466         PCRE2_ERROR_BADOPTION      the value of what was invalid
1467         PCRE2_ERROR_UNSET          the requested field is not set
1468
1469       The "magic number" is placed at the start of each compiled  pattern  as
1470       a  simple  check against passing an arbitrary memory pointer. Here is a
1471       typical call of pcre2_pattern_info(), to obtain the length of the  com‐
1472       piled pattern:
1473
1474         int rc;
1475         size_t length;
1476         rc = pcre2_pattern_info(
1477           re,               /* result of pcre2_compile() */
1478           PCRE2_INFO_SIZE,  /* what is required */
1479           &length);         /* where to put the data */
1480
1481       The possible values for the second argument are defined in pcre2.h, and
1482       are as follows:
1483
1484         PCRE2_INFO_ALLOPTIONS
1485         PCRE2_INFO_ARGOPTIONS
1486
1487       Return a copy of the pattern's options. The third argument should point
1488       to  a  uint32_t  variable.  PCRE2_INFO_ARGOPTIONS  returns  exactly the
1489       options that were passed to pcre2_compile(), whereas  PCRE2_INFO_ALLOP‐
1490       TIONS  returns  the compile options as modified by any top-level (*XXX)
1491       option settings such as (*UTF) at the start of the pattern itself.
1492
1493       For  example,  if  the  pattern  /(*UTF)abc/  is  compiled   with   the
1494       PCRE2_EXTENDED   option,   the   result  for  PCRE2_INFO_ALLOPTIONS  is
1495       PCRE2_EXTENDED and PCRE2_UTF.  Option settings such as  (?i)  that  can
1496       change  within  a pattern do not affect the result of PCRE2_INFO_ALLOP‐
1497       TIONS, even if they appear right at the start of the pattern. (This was
1498       different in some earlier releases.)
1499
1500       A  pattern compiled without PCRE2_ANCHORED is automatically anchored by
1501       PCRE2 if the first significant item in every top-level branch is one of
1502       the following:
1503
1504         ^     unless PCRE2_MULTILINE is set
1505         \A    always
1506         \G    always
1507         .*    sometimes - see below
1508
1509       When  .* is the first significant item, anchoring is possible only when
1510       all the following are true:
1511
1512         .* is not in an atomic group
1513         .* is not in a capturing group that is the subject
1514              of a back reference
1515         PCRE2_DOTALL is in force for .*
1516         Neither (*PRUNE) nor (*SKIP) appears in the pattern.
1517         PCRE2_NO_DOTSTAR_ANCHOR is not set.
1518
1519       For patterns that are auto-anchored, the PCRE2_ANCHORED bit is  set  in
1520       the options returned for PCRE2_INFO_ALLOPTIONS.
1521
1522         PCRE2_INFO_BACKREFMAX
1523
1524       Return  the  number  of  the highest back reference in the pattern. The
1525       third argument should point to a uint32_t variable.  Named  subpatterns
1526       acquire  numbers  as well as names, and these count towards the highest
1527       back reference.  Back references such as \4 or \g{12}  match  the  cap‐
1528       tured  characters of the given group, but in addition, the check that a
1529       capturing group is set in a conditional subpattern such as (?(3)a|b) is
1530       also  a  back  reference.  Zero is returned if there are no back refer‐
1531       ences.
1532
1533         PCRE2_INFO_BSR
1534
1535       The output is a uint32_t whose value indicates what character sequences
1536       the \R escape sequence matches. A value of PCRE2_BSR_UNICODE means that
1537       \R matches any Unicode line ending sequence; a value of  PCRE2_BSR_ANY‐
1538       CRLF means that \R matches only CR, LF, or CRLF.
1539
1540         PCRE2_INFO_CAPTURECOUNT
1541
1542       Return  the highest capturing subpattern number in the pattern. In pat‐
1543       terns where (?| is not used, this is also the total number of capturing
1544       subpatterns.  The third argument should point to a uint32_t variable.
1545
1546         PCRE2_INFO_FIRSTBITMAP
1547
1548       In  the absence of a single first code unit for a non-anchored pattern,
1549       pcre2_compile() may construct a 256-bit table that defines a fixed  set
1550       of  values for the first code unit in any match. For example, a pattern
1551       that starts with [abc] results in a table with  three  bits  set.  When
1552       code  unit  values greater than 255 are supported, the flag bit for 255
1553       means "any code unit of value 255 or above". If such a table  was  con‐
1554       structed,  a pointer to it is returned. Otherwise NULL is returned. The
1555       third argument should point to a const uint8_t * variable.
1556
1557         PCRE2_INFO_FIRSTCODETYPE
1558
1559       Return information about the first code unit of any matched string, for
1560       a  non-anchored  pattern. The third argument should point to a uint32_t
1561       variable. If there is a fixed first value, for example, the letter  "c"
1562       from a pattern such as (cat|cow|coyote), 1 is returned, and the charac‐
1563       ter value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there  is
1564       no  fixed  first  value, but it is known that a match can occur only at
1565       the start of the subject or following a newline in the  subject,  2  is
1566       returned. Otherwise, and for anchored patterns, 0 is returned.
1567
1568         PCRE2_INFO_FIRSTCODEUNIT
1569
1570       Return  the  value  of the first code unit of any matched string in the
1571       situation where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0.
1572       The  third  argument  should point to a uint32_t variable. In the 8-bit
1573       library, the value is always less than 256. In the 16-bit  library  the
1574       value  can  be  up  to 0xffff. In the 32-bit library in UTF-32 mode the
1575       value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32
1576       mode.
1577
1578         PCRE2_INFO_HASBACKSLASHC
1579
1580       Return  1 if the pattern contains any instances of \C, otherwise 0. The
1581       third argument should point to a uint32_t variable.
1582
1583         PCRE2_INFO_HASCRORLF
1584
1585       Return 1 if the pattern contains any explicit  matches  for  CR  or  LF
1586       characters,  otherwise 0. The third argument should point to a uint32_t
1587       variable. An explicit match is either a literal CR or LF character,  or
1588       \r or \n.
1589
1590         PCRE2_INFO_JCHANGED
1591
1592       Return  1  if  the (?J) or (?-J) option setting is used in the pattern,
1593       otherwise 0. The third argument should point to  a  uint32_t  variable.
1594       (?J)  and  (?-J) set and unset the local PCRE2_DUPNAMES option, respec‐
1595       tively.
1596
1597         PCRE2_INFO_JITSIZE
1598
1599       If the compiled pattern was successfully  processed  by  pcre2_jit_com‐
1600       pile(),  return  the  size  of  the JIT compiled code, otherwise return
1601       zero. The third argument should point to a size_t variable.
1602
1603         PCRE2_INFO_LASTCODETYPE
1604
1605       Returns 1 if there is a rightmost literal code unit that must exist  in
1606       any  matched string, other than at its start. The third argument should
1607       point  to  a  uint32_t  variable.  If  there  is  no  such  value, 0 is
1608       returned.  When  1  is  returned,  the  code  unit  value itself can be
1609       retrieved using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a  last
1610       literal  value  is  recorded  only  if it follows something of variable
1611       length. For example, for the pattern /^a\d+z\d+/ the returned value  is
1612       1  (with  "z" returned from PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/
1613       the returned value is 0.
1614
1615         PCRE2_INFO_LASTCODEUNIT
1616
1617       Return the value of the rightmost literal code unit that must exist  in
1618       any  matched  string, other than at its start, if such a value has been
1619       recorded.  The  third argument should point to a uint32_t variable.  If
1620       there is no such value, 0 is returned.
1621
1622         PCRE2_INFO_MATCHEMPTY
1623
1624       Return  1  if the pattern might match an empty string, otherwise 0. The
1625       third  argument  should point to a uint32_t variable.  When  a  pattern
1626       contains recursive subroutine calls it is not always possible to deter‐
1627       mine whether or not it can match an empty string. PCRE2  takes  a  cau‐
1628       tious approach and returns 1 in such cases.
1629
1630         PCRE2_INFO_MATCHLIMIT
1631
1632       If  the  pattern  set  a  match  limit by including an item of the form
1633       (*LIMIT_MATCH=nnnn) at the start, the  value  is  returned.  The  third
1634       argument  should  point to an unsigned 32-bit integer. If no such value
1635       has been set,  the  call  to  pcre2_pattern_info()  returns  the  error
1636       PCRE2_ERROR_UNSET.
1637
1638         PCRE2_INFO_MAXLOOKBEHIND
1639
1640       Return the number of characters (not code units) in the longest lookbe‐
1641       hind assertion in the pattern. The third argument should  point  to  an
1642       unsigned  32-bit  integer. This information is useful when doing multi-
1643       segment matching using the partial matching facilities. Note  that  the
1644       simple assertions \b and \B require a one-character lookbehind. \A also
1645       registers a one-character  lookbehind,  though  it  does  not  actually
1646       inspect  the  previous  character.  This is to ensure that at least one
1647       character from the old segment is retained when a new segment  is  pro‐
1648       cessed. Otherwise, if there are no lookbehinds in the pattern, \A might
1649       match incorrectly at the start of a new segment.
1650
1651         PCRE2_INFO_MINLENGTH
1652
1653       If a minimum length for matching  subject  strings  was  computed,  its
1654       value  is  returned.  Otherwise the returned value is 0. The value is a
1655       number of characters, which in UTF mode may be different from the  num‐
1656       ber  of  code  units.   The  third  argument should point to a uint32_t
1657       variable. The value is a lower bound to  the  length  of  any  matching
1658       string.  There  may  not be any strings of that length that do actually
1659       match, but every string that does match is at least that long.
1660
1661         PCRE2_INFO_NAMECOUNT
1662         PCRE2_INFO_NAMEENTRYSIZE
1663         PCRE2_INFO_NAMETABLE
1664
1665       PCRE2 supports the use of named as well as numbered capturing parenthe‐
1666       ses.  The names are just an additional way of identifying the parenthe‐
1667       ses, which still acquire numbers. Several convenience functions such as
1668       pcre2_substring_get_byname()  are provided for extracting captured sub‐
1669       strings by name. It is also possible to extract the data  directly,  by
1670       first  converting  the  name to a number in order to access the correct
1671       pointers in the output vector (described with pcre2_match() below).  To
1672       do  the  conversion,  you  need to use the name-to-number map, which is
1673       described by these three values.
1674
1675       The map consists of a number of  fixed-size  entries.  PCRE2_INFO_NAME‐
1676       COUNT  gives  the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives
1677       the size of each entry in code units; both of these return  a  uint32_t
1678       value. The entry size depends on the length of the longest name.
1679
1680       PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table.
1681       This is a PCRE2_SPTR pointer to a block of code  units.  In  the  8-bit
1682       library,  the  first two bytes of each entry are the number of the cap‐
1683       turing parenthesis, most significant byte first. In the 16-bit library,
1684       the  pointer  points  to 16-bit code units, the first of which contains
1685       the parenthesis number. In the 32-bit library, the  pointer  points  to
1686       32-bit  code units, the first of which contains the parenthesis number.
1687       The rest of the entry is the corresponding name, zero terminated.
1688
1689       The names are in alphabetical order. If (?| is used to create  multiple
1690       groups  with  the same number, as described in the section on duplicate
1691       subpattern numbers in the pcre2pattern page, the groups  may  be  given
1692       the  same  name,  but  there  is only one entry in the table. Different
1693       names for groups of the same number are not permitted.
1694
1695       Duplicate names for subpatterns with different numbers  are  permitted,
1696       but  only  if  PCRE2_DUPNAMES  is  set. They appear in the table in the
1697       order in which they were found in the pattern. In the  absence  of  (?|
1698       this  is  the  order of increasing number; when (?| is used this is not
1699       necessarily the case because later subpatterns may have lower numbers.
1700
1701       As a simple example of the name/number table,  consider  the  following
1702       pattern  after  compilation by the 8-bit library (assume PCRE2_EXTENDED
1703       is set, so white space - including newlines - is ignored):
1704
1705         (?<date> (?<year>(\d\d)?\d\d) -
1706         (?<month>\d\d) - (?<day>\d\d) )
1707
1708       There are four named subpatterns, so the table has  four  entries,  and
1709       each  entry  in the table is eight bytes long. The table is as follows,
1710       with non-printing bytes shown in hexadecimal, and undefined bytes shown
1711       as ??:
1712
1713         00 01 d  a  t  e  00 ??
1714         00 05 d  a  y  00 ?? ??
1715         00 04 m  o  n  t  h  00
1716         00 02 y  e  a  r  00 ??
1717
1718       When  writing  code  to  extract  data from named subpatterns using the
1719       name-to-number map, remember that the length of the entries  is  likely
1720       to be different for each compiled pattern.
1721
1722         PCRE2_INFO_NEWLINE
1723
1724       The output is a uint32_t with one of the following values:
1725
1726         PCRE2_NEWLINE_CR       Carriage return (CR)
1727         PCRE2_NEWLINE_LF       Linefeed (LF)
1728         PCRE2_NEWLINE_CRLF     Carriage return, linefeed (CRLF)
1729         PCRE2_NEWLINE_ANY      Any Unicode line ending
1730         PCRE2_NEWLINE_ANYCRLF  Any of CR, LF, or CRLF
1731
1732       This  specifies  the default character sequence that will be recognized
1733       as meaning "newline" while matching.
1734
1735         PCRE2_INFO_RECURSIONLIMIT
1736
1737       If the pattern set a recursion limit by including an item of  the  form
1738       (*LIMIT_RECURSION=nnnn)  at the start, the value is returned. The third
1739       argument should point to an unsigned 32-bit integer. If no  such  value
1740       has  been  set,  the  call  to  pcre2_pattern_info()  returns the error
1741       PCRE2_ERROR_UNSET.
1742
1743         PCRE2_INFO_SIZE
1744
1745       Return the size of  the  compiled  pattern  in  bytes  (for  all  three
1746       libraries).  The third argument should point to a size_t variable. This
1747       value includes the size of the general data  block  that  precedes  the
1748       code  units of the compiled pattern itself. The value that is used when
1749       pcre2_compile() is getting memory in which to place the  compiled  pat‐
1750       tern  may  be  slightly  larger than the value returned by this option,
1751       because there are cases where the code that calculates the size has  to
1752       over-estimate.  Processing  a  pattern  with  the JIT compiler does not
1753       alter the value returned by this option.
1754

INFORMATION ABOUT A PATTERN'S CALLOUTS

1756
1757       int pcre2_callout_enumerate(const pcre2_code *code,
1758         int (*callback)(pcre2_callout_enumerate_block *, void *),
1759         void *user_data);
1760
1761       A script language that supports the use of string arguments in callouts
1762       might  like  to  scan  all the callouts in a pattern before running the
1763       match. This can be done by calling pcre2_callout_enumerate(). The first
1764       argument  is  a  pointer  to a compiled pattern, the second points to a
1765       callback function, and the third is arbitrary user data.  The  callback
1766       function  is  called  for  every callout in the pattern in the order in
1767       which they appear. Its first argument is a pointer to a callout enumer‐
1768       ation  block,  and  its second argument is the user_data value that was
1769       passed to pcre2_callout_enumerate(). The contents of the  callout  enu‐
1770       meration  block  are described in the pcre2callout documentation, which
1771       also gives further details about callouts.
1772

SERIALIZATION AND PRECOMPILING

1774
1775       It is possible to save compiled patterns  on  disc  or  elsewhere,  and
1776       reload  them  later, subject to a number of restrictions. The functions
1777       whose names begin with pcre2_serialize_ are used for this purpose. They
1778       are described in the pcre2serialize documentation.
1779

THE MATCH DATA BLOCK

1781
1782       pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
1783         pcre2_general_context *gcontext);
1784
1785       pcre2_match_data *pcre2_match_data_create_from_pattern(
1786         const pcre2_code *code, pcre2_general_context *gcontext);
1787
1788       void pcre2_match_data_free(pcre2_match_data *match_data);
1789
1790       Information  about  a  successful  or unsuccessful match is placed in a
1791       match data block, which is an opaque  structure  that  is  accessed  by
1792       function  calls.  In particular, the match data block contains a vector
1793       of offsets into the subject string that define the matched part of  the
1794       subject  and  any  substrings  that were captured. This is known as the
1795       ovector.
1796
1797       Before calling pcre2_match(), pcre2_dfa_match(),  or  pcre2_jit_match()
1798       you must create a match data block by calling one of the creation func‐
1799       tions above. For pcre2_match_data_create(), the first argument  is  the
1800       number  of  pairs  of  offsets  in  the ovector. One pair of offsets is
1801       required to identify the string that matched the  whole  pattern,  with
1802       another  pair  for  each  captured substring. For example, a value of 4
1803       creates enough space to record the matched portion of the subject  plus
1804       three  captured  substrings.  A  minimum  of  1  pair  is  imposed   by
1805       pcre2_match_data_create(), so it is always possible to return the over‐
1806       all matched string.
1807
1808       The second argument of pcre2_match_data_create() is a pointer to a gen‐
1809       eral context, which can specify custom memory management for  obtaining
1810       the memory for the match data block. If you are not using custom memory
1811       management, pass NULL, which causes malloc() to be used.
1812
1813       For pcre2_match_data_create_from_pattern(), the  first  argument  is  a
1814       pointer to a compiled pattern. The ovector is created to be exactly the
1815       right size to hold all the substrings a pattern might capture. The sec‐
1816       ond  argument is again a pointer to a general context, but in this case
1817       if NULL is passed, the memory is obtained using the same allocator that
1818       was used for the compiled pattern (custom or default).
1819
1820       A  match  data block can be used many times, with the same or different
1821       compiled patterns. You can extract information from a match data  block
1822       after  a  match  operation  has  finished,  using  functions  that  are
1823       described in the sections on  matched  strings  and  other  match  data
1824       below.
1825
1826       When  a  call  of  pcre2_match()  fails, valid data is available in the
1827       match   block   only   when   the   error    is    PCRE2_ERROR_NOMATCH,
1828       PCRE2_ERROR_PARTIAL,  or  one  of  the  error  codes for an invalid UTF
1829       string. Exactly what is available depends on the error, and is detailed
1830       below.
1831
1832       When  one of the matching functions is called, pointers to the compiled
1833       pattern and the subject string are set in the match data block so  that
1834       they  can  be  referenced  by the extraction functions. After running a
1835       match, you must not free a compiled pattern or a subject  string  until
1836       after  all  operations  on  the  match data block (for that match) have
1837       taken place.
1838
1839       When a match data block itself is no longer needed, it should be  freed
1840       by calling pcre2_match_data_free().
1841

MATCHING A PATTERN: THE TRADITIONAL FUNCTION

1843
1844       int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
1845         PCRE2_SIZE length, PCRE2_SIZE startoffset,
1846         uint32_t options, pcre2_match_data *match_data,
1847         pcre2_match_context *mcontext);
1848
1849       The  function pcre2_match() is called to match a subject string against
1850       a compiled pattern, which is passed in the code argument. You can  call
1851       pcre2_match() with the same code argument as many times as you like, in
1852       order to find multiple matches in the subject string or to  match  dif‐
1853       ferent subject strings with the same pattern.
1854
1855       This  function  is  the  main  matching facility of the library, and it
1856       operates in a Perl-like manner. For specialist use  there  is  also  an
1857       alternative  matching function, which is described below in the section
1858       about the pcre2_dfa_match() function.
1859
1860       Here is an example of a simple call to pcre2_match():
1861
1862         pcre2_match_data *md = pcre2_match_data_create(4, NULL);
1863         int rc = pcre2_match(
1864           re,             /* result of pcre2_compile() */
1865           "some string",  /* the subject string */
1866           11,             /* the length of the subject string */
1867           0,              /* start at offset 0 in the subject */
1868           0,              /* default options */
1869           md,             /* the match data block */
1870           NULL);          /* a match context; NULL means use defaults */
1871
1872       If the subject string is zero-terminated, the length can  be  given  as
1873       PCRE2_ZERO_TERMINATED. A match context must be provided if certain less
1874       common matching parameters are to be changed. For details, see the sec‐
1875       tion on the match context above.
1876
1877   The string to be matched by pcre2_match()
1878
1879       The  subject string is passed to pcre2_match() as a pointer in subject,
1880       a length in length, and a starting offset in  startoffset.  The  length
1881       and  offset  are  in  code units, not characters.  That is, they are in
1882       bytes for the 8-bit library, 16-bit code units for the 16-bit  library,
1883       and  32-bit  code units for the 32-bit library, whether or not UTF pro‐
1884       cessing is enabled.
1885
1886       If startoffset is greater than the length of the subject, pcre2_match()
1887       returns  PCRE2_ERROR_BADOFFSET.  When  the starting offset is zero, the
1888       search for a match starts at the beginning of the subject, and this  is
1889       by far the most common case. In UTF-8 or UTF-16 mode, the starting off‐
1890       set must point to the start of a character, or to the end of  the  sub‐
1891       ject  (in  UTF-32 mode, one code unit equals one character, so all off‐
1892       sets are valid). Like the  pattern  string,  the  subject  may  contain
1893       binary zeroes.
1894
1895       A  non-zero  starting offset is useful when searching for another match
1896       in the same subject by calling pcre2_match()  again  after  a  previous
1897       success.   Setting  startoffset  differs  from passing over a shortened
1898       string and setting PCRE2_NOTBOL in the case of a  pattern  that  begins
1899       with any kind of lookbehind. For example, consider the pattern
1900
1901         \Biss\B
1902
1903       which  finds  occurrences  of "iss" in the middle of words. (\B matches
1904       only if the current position in the subject is not  a  word  boundary.)
1905       When applied to the string "Mississippi" the first call to pcre2_match()
1906       finds the first occurrence. If pcre2_match() is called again with  just
1907       the  remainder  of  the  subject,  namely "issippi", it does not match,
1908       because \B is always false at the start of the subject, which is deemed
1909       to  be  a word boundary. However, if pcre2_match() is passed the entire
1910       string again, but with startoffset set to 4, it finds the second occur‐
1911       rence  of "iss" because it is able to look behind the starting point to
1912       discover that it is preceded by a letter.
1913
1914       Finding all the matches in a subject is tricky  when  the  pattern  can
1915       match an empty string. It is possible to emulate Perl's /g behaviour by
1916       first  trying  the  match  again  at  the   same   offset,   with   the
1917       PCRE2_NOTEMPTY_ATSTART  and  PCRE2_ANCHORED  options,  and then if that
1918       fails, advancing the starting  offset  and  trying  an  ordinary  match
1919       again.  There  is  some  code  that  demonstrates how to do this in the
1920       pcre2demo sample program. In the most general case, you have  to  check
1921       to  see  if the newline convention recognizes CRLF as a newline, and if
1922       so, and the current character is CR followed by LF, advance the  start‐
1923       ing offset by two characters instead of one.
1924
1925       If  a  non-zero starting offset is passed when the pattern is anchored,
1926       one attempt to match at the given offset is made. This can only succeed
1927       if  the  pattern  does  not require the match to be at the start of the
1928       subject.
1929
1930   Option bits for pcre2_match()
1931
1932       The unused bits of the options argument for pcre2_match() must be zero.
1933       The  only  bits  that  may  be  set  are  PCRE2_ANCHORED, PCRE2_NOTBOL,
1934       PCRE2_NOTEOL,  PCRE2_NOTEMPTY,  PCRE2_NOTEMPTY_ATSTART,   PCRE2_NO_JIT,
1935       PCRE2_NO_UTF_CHECK,  PCRE2_PARTIAL_HARD,  and PCRE2_PARTIAL_SOFT. Their
1936       action is described below.
1937
1938       Setting PCRE2_ANCHORED at match time is not supported by  the  just-in-
1939       time  (JIT)  compiler.  If  it is set, JIT matching is disabled and the
1940       normal  interpretive  code  in  pcre2_match()  is   run.   Apart   from
1941       PCRE2_NO_JIT  (obviously),  the remaining options are supported for JIT
1942       matching.
1943
1944         PCRE2_ANCHORED
1945
1946       The PCRE2_ANCHORED option limits pcre2_match() to matching at the first
1947       matching  position.  If  a pattern was compiled with PCRE2_ANCHORED, or
1948       turned out to be anchored by virtue of its contents, it cannot be  made
1949       unanchored at matching time. Note that setting the option at match time
1950       disables JIT matching.
1951
1952         PCRE2_NOTBOL
1953
1954       This option specifies that the first character of the subject string is not
1955       the  beginning  of  a  line, so the circumflex metacharacter should not
1956       match before it. Setting this without  having  set  PCRE2_MULTILINE  at
1957       compile time causes circumflex never to match. This option affects only
1958       the behaviour of the circumflex metacharacter. It does not affect \A.
1959
1960         PCRE2_NOTEOL
1961
1962       This option specifies that the end of the subject string is not the end
1963       of  a line, so the dollar metacharacter should not match it nor (except
1964       in multiline mode) a newline immediately before it. Setting this  with‐
1965       out  having  set PCRE2_MULTILINE at compile time causes dollar never to
1966       match. This option affects only the behaviour of the dollar metacharac‐
1967       ter. It does not affect \Z or \z.
1968
1969         PCRE2_NOTEMPTY
1970
1971       An empty string is not considered to be a valid match if this option is
1972       set. If there are alternatives in the pattern, they are tried.  If  all
1973       the  alternatives  match  the empty string, the entire match fails. For
1974       example, if the pattern
1975
1976         a?b?
1977
1978       is applied to a string not beginning with "a" or  "b",  it  matches  an
1979       empty string at the start of the subject. With PCRE2_NOTEMPTY set, this
1980       match is not valid, so pcre2_match() searches further into  the  string
1981       for occurrences of "a" or "b".
1982
1983         PCRE2_NOTEMPTY_ATSTART
1984
1985       This  is  like PCRE2_NOTEMPTY, except that it locks out an empty string
1986       match only at the first matching position, that is, at the start of the
1987       subject  plus  the  starting offset. An empty string match later in the
1988       subject is permitted.  If the pattern is anchored,  such  a  match  can
1989       occur only if the pattern contains \K.
1990
1991         PCRE2_NO_JIT
1992
1993       By   default,   if   a  pattern  has  been  successfully  processed  by
1994       pcre2_jit_compile(), JIT is automatically used  when  pcre2_match()  is
1995       called  with  options  that JIT supports. Setting PCRE2_NO_JIT disables
1996       the use of JIT; it forces matching to be done by the interpreter.
1997
1998         PCRE2_NO_UTF_CHECK
1999
2000       When PCRE2_UTF is set at compile time, the validity of the subject as a
2001       UTF  string  is  checked  by default when pcre2_match() is subsequently
2002       called.  If a non-zero starting offset is given, the check  is  applied
2003       only  to that part of the subject that could be inspected during match‐
2004       ing, and there is a check that the starting offset points to the  first
2005       code  unit of a character or to the end of the subject. If there are no
2006       lookbehind assertions in the pattern, the check starts at the  starting
2007       offset.  Otherwise,  it  starts at the length of the longest lookbehind
2008       before the starting offset, or at the start of the subject if there are
2009       not  that  many  characters  before  the starting offset. Note that the
2010       sequences \b and \B are one-character lookbehinds.
2011
2012       The check is carried out before any other processing takes place, and a
2013       negative  error  code is returned if the check fails. There are several
2014       UTF error codes for each code unit width,  corresponding  to  different
2015       problems  with  the code unit sequence. There are discussions about the
2016       validity of UTF-8 strings, UTF-16 strings, and UTF-32  strings  in  the
2017       pcre2unicode page.
2018
2019       If  you  know  that  your  subject is valid, and you want to skip these
2020       checks for performance reasons,  you  can  set  the  PCRE2_NO_UTF_CHECK
2021       option  when  calling  pcre2_match(). You might want to do this for the
2022       second and subsequent calls to pcre2_match() if you are making repeated
2023       calls to find all the matches in a single subject string.
2024
2025       NOTE:  When PCRE2_NO_UTF_CHECK is set, the effect of passing an invalid
2026       string as a subject, or an invalid value of startoffset, is  undefined.
2027       Your program may crash or loop indefinitely.
2028
2029         PCRE2_PARTIAL_HARD
2030         PCRE2_PARTIAL_SOFT
2031
2032       These  options  turn  on  the partial matching feature. A partial match
2033       occurs if the end of the subject string is  reached  successfully,  but
2034       there  are not enough subject characters to complete the match. If this
2035       happens when PCRE2_PARTIAL_SOFT (but not  PCRE2_PARTIAL_HARD)  is  set,
2036       matching  continues  by  testing any remaining alternatives. Only if no
2037       complete match can be found is PCRE2_ERROR_PARTIAL returned instead  of
2038       PCRE2_ERROR_NOMATCH.  In other words, PCRE2_PARTIAL_SOFT specifies that
2039       the caller is prepared to handle a partial match, but only if  no  com‐
2040       plete match can be found.
2041
2042       If  PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this
2043       case, if a partial match is found,  pcre2_match()  immediately  returns
2044       PCRE2_ERROR_PARTIAL,  without  considering  any  other alternatives. In
2045       other words, when PCRE2_PARTIAL_HARD is set, a partial match is consid‐
2046       ered to be more important than an alternative complete match.
2047
2048       There is a more detailed discussion of partial and multi-segment match‐
2049       ing, with examples, in the pcre2partial documentation.
2050

NEWLINE HANDLING WHEN MATCHING

2052
2053       When PCRE2 is built, a default newline convention is set; this is  usu‐
2054       ally  the standard convention for the operating system. The default can
2055       be overridden in a compile context by calling  pcre2_set_newline().  It
2056       can  also be overridden by starting a pattern string with, for example,
2057       (*CRLF), as described in the section  on  newline  conventions  in  the
2058       pcre2pattern  page. During matching, the newline choice affects the be‐
2059       haviour of the dot, circumflex, and dollar metacharacters. It may  also
2060       alter  the  way  the  match starting position is advanced after a match
2061       failure for an unanchored pattern.
2062
2063       When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is
2064       set  as  the  newline convention, and a match attempt for an unanchored
2065       pattern fails when the current starting position is at a CRLF sequence,
2066       and  the  pattern contains no explicit matches for CR or LF characters,
2067       the match position is advanced by two characters  instead  of  one,  in
2068       other words, to after the CRLF.
2069
2070       The above rule is a compromise that makes the most common cases work as
2071       expected. For example, if the pattern  is  .+A  (and  the  PCRE2_DOTALL
2072       option is not set), it does not match the string "\r\nA" because, after
2073       failing at the start, it skips both the CR and the LF before  retrying.
2074       However,  the  pattern  [\r\n]A does match that string, because it con‐
2075       tains an explicit CR or LF reference, and so advances only by one char‐
2076       acter after the first failure.
2077
2078       An explicit match for CR or LF is either a literal appearance of one of
2079       those characters in the  pattern,  or  one  of  the  \r  or  \n  escape
2080       sequences.  Implicit  matches  such  as [^X] do not count, nor does \s,
2081       even though it includes CR and LF in the characters that it matches.
2082
2083       Notwithstanding the above, anomalous effects may still occur when  CRLF
2084       is a valid newline sequence and explicit \r or \n escapes appear in the
2085       pattern.
2086

HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS

2088
2089       uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
2090
2091       PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
2092
2093       In general, a pattern matches a certain portion of the subject, and  in
2094       addition,  further  substrings  from  the  subject may be picked out by
2095       parenthesized parts of the pattern.  Following  the  usage  in  Jeffrey
2096       Friedl's  book,  this  is  called  "capturing" in what follows, and the
2097       phrase "capturing subpattern" or "capturing group" is used for a  frag‐
2098       ment  of  a  pattern that picks out a substring. PCRE2 supports several
2099       other kinds of parenthesized subpattern that do not cause substrings to
2100       be  captured. The pcre2_pattern_info() function can be used to find out
2101       how many capturing subpatterns there are in a compiled pattern.
2102
2103       You can use auxiliary functions for accessing  captured  substrings  by
2104       number or by name, as described in sections below.
2105
2106       Alternatively, you can make direct use of the vector of PCRE2_SIZE val‐
2107       ues, called  the  ovector,  which  contains  the  offsets  of  captured
2108       strings.   It   is   part  of  the  match  data  block.   The  function
2109       pcre2_get_ovector_pointer() returns the address  of  the  ovector,  and
2110       pcre2_get_ovector_count() returns the number of pairs of values it con‐
2111       tains.
2112
2113       Within the ovector, the first in each pair of values is set to the off‐
2114       set of the first code unit of a substring, and the second is set to the
2115       offset of the first code unit after the end of a substring. These  val‐
2116       ues  are always code unit offsets, not character offsets. That is, they
2117       are byte offsets in the 8-bit library, 16-bit  offsets  in  the  16-bit
2118       library, and 32-bit offsets in the 32-bit library.
2119
2120       After  a  partial  match  (error  return PCRE2_ERROR_PARTIAL), only the
2121       first pair of offsets (that is, ovector[0]  and  ovector[1])  are  set.
2122       They  identify  the part of the subject that was partially matched. See
2123       the pcre2partial documentation for details of partial matching.
2124
2125       After a successful match, the first pair of offsets identifies the por‐
2126       tion  of the subject string that was matched by the entire pattern. The
2127       next pair is used for the first capturing subpattern, and  so  on.  The
2128       value  returned  by pcre2_match() is one more than the highest numbered
2129       pair that has been set. For example, if two substrings have  been  cap‐
2130       tured,  the returned value is 3. If there are no capturing subpatterns,
2131       the return value from a successful match is 1, indicating that just the
2132       first pair of offsets has been set.
2133
2134       If  a  pattern uses the \K escape sequence within a positive assertion,
2135       the reported start of a successful match can be greater than the end of
2136       the  match.   For  example,  if the pattern (?=ab\K) is matched against
2137       "ab", the start and end offset values for the match are 2 and 0.
2138
2139       If a capturing subpattern group is matched repeatedly within  a  single
2140       match  operation, it is the last portion of the subject that it matched
2141       that is returned.
2142
2143       If the ovector is too small to hold all the captured substring offsets,
2144       as  much  as possible is filled in, and the function returns a value of
2145       zero. If captured substrings are not of interest, pcre2_match() may  be
2146       called with a match data block whose ovector is of minimum length (that
2147       is, one pair). However, if the pattern contains back references and the
2148       ovector is not big enough to remember the related substrings, PCRE2 has
2149       to get additional memory for use during matching. Thus  it  is  usually
2150       advisable to set up a match data block containing an ovector of reason‐
2151       able size.
2152
2153       It is possible for capturing subpattern number n+1 to match  some  part
2154       of the subject when subpattern n has not been used at all. For example,
2155       if the string "abc" is matched  against  the  pattern  (a|(z))(bc)  the
2156       return from the function is 4, and subpatterns 1 and 3 are matched, but
2157       2 is not. When this happens, both values in  the  offset  pairs  corre‐
2158       sponding to unused subpatterns are set to PCRE2_UNSET.
2159
2160       Offset  values  that correspond to unused subpatterns at the end of the
2161       expression are also set to PCRE2_UNSET.  For  example,  if  the  string
2162       "abc" is matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3
2163       are not matched.  The return from the function is 2, because the  high‐
2164       est used capturing subpattern number is 1. The  offsets  for  the  sec‐
2165       ond and third capturing  subpatterns  (assuming  the  vector  is  large
2166       enough, of course) are set to PCRE2_UNSET.
2167
2168       Elements in the ovector that do not correspond to capturing parentheses
2169       in the pattern are never changed. That is, if a pattern contains n cap‐
2170       turing parentheses, no more than ovector[0] to ovector[2n+1] are set by
2171       pcre2_match(). The other elements retain whatever  values  they  previ‐
2172       ously had.
2173

OTHER INFORMATION ABOUT A MATCH

2175
2176       PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
2177
2178       PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
2179
2180       As  well as the offsets in the ovector, other information about a match
2181       is retained in the match data block and can be retrieved by  the  above
2182       functions  in  appropriate  circumstances.  If they are called at other
2183       times, the result is undefined.
2184
2185       After a successful match, a partial match (PCRE2_ERROR_PARTIAL),  or  a
2186       failure  to  match  (PCRE2_ERROR_NOMATCH), a (*MARK) name may be avail‐
2187       able, and pcre2_get_mark() can be called. It returns a pointer  to  the
2188       zero-terminated  name,  which is within the compiled pattern. Otherwise
2189       NULL is returned. The length of the (*MARK) name (excluding the  termi‐
2190       nating  zero)  is  stored  in the code unit that precedes the name. You
2191       should use this instead of relying  on  the  terminating  zero  if  the
2192       (*MARK) name might contain a binary zero.
2193
2194       After a successful match, the (*MARK) name that is returned is the last
2195       one encountered on the matching path through the pattern. After  a  "no
2196       match"  or  a  partial  match,  the  last  encountered  (*MARK) name is
2197       returned. For example, consider this pattern:
2198
2199         ^(*MARK:A)((*MARK:B)a|b)c
2200
2201       When it matches "bc", the returned mark is A. The B mark is  "seen"  in
2202       the  first  branch of the group, but it is not on the matching path. On
2203       the other hand, when this pattern fails to  match  "bx",  the  returned
2204       mark is B.
2205
2206       After  a  successful  match, a partial match, or one of the invalid UTF
2207       errors (for example, PCRE2_ERROR_UTF8_ERR5), pcre2_get_startchar()  can
2208       be called. After a successful or partial match it returns the code unit
2209       offset of the character at which the match started. For  a  non-partial
2210       match,  this can be different to the value of ovector[0] if the pattern
2211       contains the \K escape sequence. After a partial match,  however,  this
2212       value  is  always the same as ovector[0] because \K does not affect the
2213       result of a partial match.
2214
2215       After a UTF check failure, pcre2_get_startchar() can be used to  obtain
2216       the code unit offset of the invalid UTF character. Details are given in
2217       the pcre2unicode page.
2218

ERROR RETURNS FROM pcre2_match()

2220
2221       If pcre2_match() fails, it returns a negative number. This can be  con‐
2222       verted  to a text string by calling the pcre2_get_error_message() func‐
2223       tion (see "Obtaining a textual error message" below).   Negative  error
2224       codes  are  also  returned  by other functions, and are documented with
2225       them. The codes are given names in the header file. If UTF checking  is
2226       in force and an invalid UTF subject string is detected, one of a number
2227       of UTF-specific negative error codes is returned. Details are given  in
2228       the  pcre2unicode  page. The following are the other errors that may be
2229       returned by pcre2_match():
2230
2231         PCRE2_ERROR_NOMATCH
2232
2233       The subject string did not match the pattern.
2234
2235         PCRE2_ERROR_PARTIAL
2236
2237       The subject string did not match, but it did match partially.  See  the
2238       pcre2partial documentation for details of partial matching.
2239
2240         PCRE2_ERROR_BADMAGIC
2241
2242       PCRE2 stores a 4-byte "magic number" at the start of the compiled code,
2243       to catch the case when it is passed a junk pointer. This is  the  error
2244       that is returned when the magic number is not present.
2245
2246         PCRE2_ERROR_BADMODE
2247
2248       This  error  is  given  when  a  pattern that was compiled by the 8-bit
2249       library is passed to a 16-bit  or  32-bit  library  function,  or  vice
2250       versa.
2251
2252         PCRE2_ERROR_BADOFFSET
2253
2254       The value of startoffset was greater than the length of the subject.
2255
2256         PCRE2_ERROR_BADOPTION
2257
2258       An unrecognized bit was set in the options argument.
2259
2260         PCRE2_ERROR_BADUTFOFFSET
2261
2262       The UTF code unit sequence that was passed as a subject was checked and
2263       found to be valid (the PCRE2_NO_UTF_CHECK option was not set), but  the
2264       value  of startoffset did not point to the beginning of a UTF character
2265       or the end of the subject.
2266
2267         PCRE2_ERROR_CALLOUT
2268
2269       This error is never generated by pcre2_match() itself. It  is  provided
2270       for  use  by  callout  functions  that  want  to cause pcre2_match() or
2271       pcre2_callout_enumerate() to return a distinctive error code.  See  the
2272       pcre2callout documentation for details.
2273
2274         PCRE2_ERROR_INTERNAL
2275
2276       An  unexpected  internal error has occurred. This error could be caused
2277       by a bug in PCRE2 or by overwriting of the compiled pattern.
2278
2279         PCRE2_ERROR_JIT_BADOPTION
2280
2281       This error is returned when a pattern  that  was  successfully  studied
2282       using  JIT is being matched, but the matching mode (partial or complete
2283       match) does not correspond to any JIT compilation mode.  When  the  JIT
2284       fast  path  function  is used, this error may be also given for invalid
2285       options. See the pcre2jit documentation for more details.
2286
2287         PCRE2_ERROR_JIT_STACKLIMIT
2288
2289       This error is returned when a pattern  that  was  successfully  studied
2290       using  JIT  is being matched, but the memory available for the just-in-
2291       time processing stack is not large enough. See the pcre2jit  documenta‐
2292       tion for more details.
2293
2294         PCRE2_ERROR_MATCHLIMIT
2295
2296       The backtracking limit was reached.
2297
2298         PCRE2_ERROR_NOMEMORY
2299
2300       If  a  pattern  contains  back  references,  but the ovector is not big
2301       enough to remember the referenced substrings, PCRE2  gets  a  block  of
2302       memory at the start of matching to use for this purpose. There are some
2303       other special cases where extra memory is needed during matching.  This
2304       error is given when memory cannot be obtained.
2305
2306         PCRE2_ERROR_NULL
2307
2308       Either the code, subject, or match_data argument was passed as NULL.
2309
2310         PCRE2_ERROR_RECURSELOOP
2311
2312       This  error  is  returned  when  pcre2_match() detects a recursion loop
2313       within the pattern. Specifically, it means that either the  whole  pat‐
2314       tern or a subpattern has been called recursively for the second time at
2315       the same position in the subject  string.  Some  simple  patterns  that
2316       might  do  this are detected and faulted at compile time, but more com‐
2317       plicated cases, in particular mutual recursions between  two  different
2318       subpatterns, cannot be detected until matching is attempted.
2319
2320         PCRE2_ERROR_RECURSIONLIMIT
2321
2322       The internal recursion limit was reached.
2323

OBTAINING A TEXTUAL ERROR MESSAGE

2325
2326       int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
2327         PCRE2_SIZE bufflen);
2328
2329       A  text  message  for  an  error code from any PCRE2 function (compile,
2330       match, or auxiliary) can be obtained  by  calling  pcre2_get_error_mes‐
2331       sage().  The  code  is passed as the first argument, with the remaining
2332       two arguments specifying a code unit buffer and its length, into  which
2333       the  text  message is placed. Note that the message is returned in code
2334       units of the appropriate width for the library that is being used.
2335
2336       The returned message is terminated with a trailing zero, and the  func‐
2337       tion  returns  the  number  of  code units used, excluding the trailing
2338       zero.  If  the  error  number  is  unknown,  the  negative  error  code
2339       PCRE2_ERROR_BADDATA  is  returned. If the buffer is too small, the mes‐
2340       sage is truncated (but still with a trailing zero),  and  the  negative
2341       error  code PCRE2_ERROR_NOMEMORY is returned.  None of the messages are
2342       very long; a buffer size of 120 code units is ample.
2343

EXTRACTING CAPTURED SUBSTRINGS BY NUMBER

2345
2346       int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
2347         uint32_t number, PCRE2_SIZE *length);
2348
2349       int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
2350         uint32_t number, PCRE2_UCHAR *buffer,
2351         PCRE2_SIZE *bufflen);
2352
2353       int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
2354         uint32_t number, PCRE2_UCHAR **bufferptr,
2355         PCRE2_SIZE *bufflen);
2356
2357       void pcre2_substring_free(PCRE2_UCHAR *buffer);
2358
2359       Captured substrings can be accessed directly by using  the  ovector  as
2360       described above.  For convenience, auxiliary functions are provided for
2361       extracting  captured  substrings  as  new,  separate,   zero-terminated
2362       strings. A substring that contains a binary zero is correctly extracted
2363       and has a further zero added on the end, but  the  result  is  not,  of
2364       course, a C string.
2365
2366       The functions in this section identify substrings by number. The number
2367       zero refers to the entire matched substring, with higher numbers refer‐
2368       ring  to  substrings  captured by parenthesized groups. After a partial
2369       match, only substring zero is available.  An  attempt  to  extract  any
2370       other  substring  gives the error PCRE2_ERROR_PARTIAL. The next section
2371       describes similar functions for extracting captured substrings by name.
2372
2373       If a pattern uses the \K escape sequence within a  positive  assertion,
2374       the reported start of a successful match can be greater than the end of
2375       the match.  For example, if the pattern  (?=ab\K)  is  matched  against
2376       "ab",  the  start  and  end offset values for the match are 2 and 0. In
2377       this situation, calling these functions with a  zero  substring  number
2378       extracts a zero-length empty string.
2379
2380       You  can  find the length in code units of a captured substring without
2381       extracting it by calling pcre2_substring_length_bynumber().  The  first
2382       argument  is a pointer to the match data block, the second is the group
2383       number, and the third is a pointer to a variable into which the  length
2384       is  placed.  If  you just want to know whether or not the substring has
2385       been captured, you can pass the third argument as NULL.
2386
2387       The pcre2_substring_copy_bynumber() function  copies  a  captured  sub‐
2388       string  into  a supplied buffer, whereas pcre2_substring_get_bynumber()
2389       copies it into new memory, obtained using the  same  memory  allocation
2390       function  that  was  used for the match data block. The first two argu‐
2391       ments of these functions are a pointer to the match data  block  and  a
2392       capturing group number.
2393
2394       The final arguments of pcre2_substring_copy_bynumber() are a pointer to
2395       the buffer and a pointer to a variable that contains its length in code
2396       units.  This is updated to contain the actual number of code units used
2397       for the extracted substring, excluding the terminating zero.
2398
2399       For pcre2_substring_get_bynumber() the third and fourth arguments point
2400       to  variables that are updated with a pointer to the new memory and the
2401       number of code units that comprise the substring, again  excluding  the
2402       terminating  zero.  When  the substring is no longer needed, the memory
2403       should be freed by calling pcre2_substring_free().
2404
2405       The return value from all these functions is zero  for  success,  or  a
2406       negative  error  code.  If  the pattern match failed, the match failure
2407       code is returned.  If a substring number  greater  than  zero  is  used
2408       after  a partial match, PCRE2_ERROR_PARTIAL is returned. Other possible
2409       error codes are:
2410
2411         PCRE2_ERROR_NOMEMORY
2412
2413       The buffer was too small for  pcre2_substring_copy_bynumber(),  or  the
2414       attempt to get memory failed for pcre2_substring_get_bynumber().
2415
2416         PCRE2_ERROR_NOSUBSTRING
2417
2418       There  is  no  substring  with that number in the pattern, that is, the
2419       number is greater than the number of capturing parentheses.
2420
2421         PCRE2_ERROR_UNAVAILABLE
2422
2423       The substring number, though not greater than the number of captures in
2424       the pattern, is greater than the number of slots in the ovector, so the
2425       substring could not be captured.
2426
2427         PCRE2_ERROR_UNSET
2428
2429       The substring did not participate in the match.  For  example,  if  the
2430       pattern  is  (abc)|(def) and the subject is "def", and the ovector con‐
2431       tains at least two capturing slots, substring number 1 is unset.
2432

EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS

2434
2435       int pcre2_substring_list_get(pcre2_match_data *match_data,
2436         PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
2437
2438       void pcre2_substring_list_free(PCRE2_SPTR *list);
2439
2440       The pcre2_substring_list_get() function  extracts  all  available  sub‐
2441       strings  and  builds  a  list of pointers to them. It also (optionally)
2442       builds a second list that  contains  their  lengths  (in  code  units),
2443       excluding a terminating zero that is added to each of them. All this is
2444       done in a single block of memory that is obtained using the same memory
2445       allocation function that was used to get the match data block.
2446
2447       This  function  must be called only after a successful match. If called
2448       after a partial match, the error code PCRE2_ERROR_PARTIAL is returned.
2449
2450       The address of the memory block is returned via listptr, which is  also
2451       the start of the list of string pointers. The end of the list is marked
2452       by a NULL pointer. The address of the list of lengths is  returned  via
2453       lengthsptr.  If your strings do not contain binary zeros and you do not
2454       therefore need the lengths, you may supply NULL as the lengthsptr argu‐
2455       ment  to  disable  the  creation of a list of lengths. The yield of the
2456       function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the  mem‐
2457       ory  block could not be obtained. When the list is no longer needed, it
2458       should be freed by calling pcre2_substring_list_free().
2459
2460       If this function encounters a substring that is unset, which can happen
2461       when  capturing subpattern number n+1 matches some part of the subject,
2462       but subpattern n has not been used at all, it returns an empty  string.
2463       This  can  be  distinguished  from  a  genuine zero-length substring by
2464       inspecting  the  appropriate  offset  in  the  ovector,  which contains
2465       PCRE2_UNSET   for   unset   substrings,   or   by   calling  pcre2_sub‐
2466       string_length_bynumber().
2467

EXTRACTING CAPTURED SUBSTRINGS BY NAME

2469
2470       int pcre2_substring_number_from_name(const pcre2_code *code,
2471         PCRE2_SPTR name);
2472
2473       int pcre2_substring_length_byname(pcre2_match_data *match_data,
2474         PCRE2_SPTR name, PCRE2_SIZE *length);
2475
2476       int pcre2_substring_copy_byname(pcre2_match_data *match_data,
2477         PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
2478
2479       int pcre2_substring_get_byname(pcre2_match_data *match_data,
2480         PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
2481
2482       void pcre2_substring_free(PCRE2_UCHAR *buffer);
2483
2484       To extract a substring by name, you first have to find the  associated
2485       number.  For example, for this pattern:
2486
2487         (a+)b(?<xxx>\d+)...
2488
2489       the number of the subpattern called "xxx" is 2. If the name is known to
2490       be unique (PCRE2_DUPNAMES was not set), you can find  the  number  from
2491       the name by calling pcre2_substring_number_from_name(). The first argu‐
2492       ment is the compiled pattern, and the second is the name. The yield  of
2493       the function is the subpattern number, PCRE2_ERROR_NOSUBSTRING if there
2494       is no subpattern of  that  name,  or  PCRE2_ERROR_NOUNIQUESUBSTRING  if
2495       there  is  more than one subpattern of that name. Given the number, you
2496       can extract the  substring  directly,  or  use  one  of  the  functions
2497       described above.
2498
2499       For  convenience,  there are also "byname" functions that correspond to
2500       the "bynumber" functions, the only difference  being  that  the  second
2501       argument  is  a  name instead of a number. If PCRE2_DUPNAMES is set and
2502       there are duplicate names, these functions scan all the groups with the
2503       given name, and return the first named string that is set.
2504
2505       If  there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is
2506       returned. If all groups with the name have  numbers  that  are  greater
2507       than  the  number  of  slots in the ovector, PCRE2_ERROR_UNAVAILABLE is
2508       returned. If there is at least one group with a slot  in  the  ovector,
2509       but no group is found to be set, PCRE2_ERROR_UNSET is returned.
2510
2511       Warning: If the pattern uses the (?| feature to set up multiple subpat‐
2512       terns with the same number, as described in the  section  on  duplicate
2513       subpattern  numbers  in  the pcre2pattern page, you cannot use names to
2514       distinguish the different subpatterns, because names are  not  included
2515       in  the compiled code. The matching process uses only numbers. For this
2516       reason, the use of different names for subpatterns of the  same  number
2517       causes an error at compile time.
2518

CREATING A NEW STRING WITH SUBSTITUTIONS

2520
2521       int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
2522         PCRE2_SIZE length, PCRE2_SIZE startoffset,
2523         uint32_t options, pcre2_match_data *match_data,
2524         pcre2_match_context *mcontext, PCRE2_SPTR replacement,
2525         PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
2526         PCRE2_SIZE *outlengthptr);
2527
2528       This  function calls pcre2_match() and then makes a copy of the subject
2529       string in outputbuffer, replacing the part that was  matched  with  the
2530       replacement  string,  whose  length is supplied in rlength. This can be
2531       given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
2532       which  a  \K item in a lookahead in the pattern causes the match to end
2533       before it starts are not supported, and give rise to an error return.
2534
2535       The first seven arguments of pcre2_substitute() are  the  same  as  for
2536       pcre2_match(), except that the partial matching options are not permit‐
2537       ted, and match_data may be passed as NULL, in which case a  match  data
2538       block  is obtained and freed within this function, using memory manage‐
2539       ment functions from the match context, if provided, or else those  that
2540       were used to allocate memory for the compiled code.
2541
2542       The  outlengthptr  argument  must point to a variable that contains the
2543       length, in code units, of the output buffer. If the  function  is  suc‐
2544       cessful,  the value is updated to contain the length of the new string,
2545       excluding the trailing zero that is automatically added.
2546
2547       If the function is not  successful,  the  value  set  via  outlengthptr
2548       depends  on  the  type  of  error. For syntax errors in the replacement
2549       string, the value is the offset in the  replacement  string  where  the
2550       error  was  detected.  For  other  errors,  the value is PCRE2_UNSET by
2551       default. This includes the case of the output buffer being  too  small,
2552       unless  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  is  set (see below), in which
2553       case the value is the minimum length needed, including  space  for  the
2554       trailing  zero.  Note  that  in  order  to compute the required length,
2555       pcre2_substitute() has  to  simulate  all  the  matching  and  copying,
2556       instead of giving an error return as soon as the buffer overflows. Note
2557       also that the length is in code units, not bytes.
2558
2559       In the replacement string, which is interpreted as a UTF string in  UTF
2560       mode,  and  is  checked  for UTF validity unless the PCRE2_NO_UTF_CHECK
2561       option is set, a dollar character is an escape character that can spec‐
2562       ify  the insertion of characters from capturing groups or (*MARK) items
2563       in the pattern. The following forms are always recognized:
2564
2565         $$                  insert a dollar character
2566         $<n> or ${<n>}      insert the contents of group <n>
2567         $*MARK or ${*MARK}  insert the name of the last (*MARK) encountered
2568
2569       Either a group number or a group name  can  be  given  for  <n>.  Curly
2570       brackets  are  required only if the following character would be inter‐
2571       preted as part of the number or name. The number may be zero to include
2572       the  entire  matched  string.   For  example,  if  the pattern a(b)c is
2573       matched with "=abc=" and the replacement string "+$1$0$1+", the  result
2574       is "=+babcb+=".
2575
2576       The facility for inserting a (*MARK) name can be used to perform simple
2577       simultaneous substitutions, as this pcre2test example shows:
2578
2579         /(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
2580             apple lemon
2581          2: pear orange
2582
2583       As well as the usual options for pcre2_match(), a number of  additional
2584       options can be set in the options argument.
2585
2586       PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject
2587       string, replacing every matching substring. If this is  not  set,  only
2588       the  first matching substring is replaced. If any matched substring has
2589       zero length, after the substitution has happened, an attempt to find  a
2590       non-empty  match at the same position is performed. If this is not suc‐
2591       cessful, the current position is advanced by one character except  when
2592       CRLF  is  a  valid newline sequence and the next two characters are CR,
2593       LF. In this case, the current position is advanced by two characters.
2594
2595       PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when  the  output
2596       buffer is too small. The default action is to return PCRE2_ERROR_NOMEM‐
2597       ORY immediately. If this option  is  set,  however,  pcre2_substitute()
2598       continues to go through the motions of matching and substituting (with‐
2599       out, of course, writing anything) in order to compute the size of  buf‐
2600       fer  that  is  needed.  This  value is passed back via the outlengthptr
2601       variable,   with   the   result   of   the   function    still    being
2602       PCRE2_ERROR_NOMEMORY.
2603
2604       Passing  a  buffer  size  of zero is a permitted way of finding out how
2605       much memory is needed for a given substitution. However, this does mean
2606       that the entire operation is carried out twice. Depending on the appli‐
2607       cation, it may be more efficient to allocate a large  buffer  and  free
2608       the   excess   afterwards,   instead  of  using  PCRE2_SUBSTITUTE_OVER‐
2609       FLOW_LENGTH.
2610
2611       PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references  to  capturing  groups
2612       that  do  not appear in the pattern to be treated as unset groups. This
2613       option should be used with care, because it means  that  a  typo  in  a
2614       group  name  or  number  no  longer  causes the PCRE2_ERROR_NOSUBSTRING
2615       error.
2616
2617       PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing  groups  (including
2618       unknown  groups  when  PCRE2_SUBSTITUTE_UNKNOWN_UNSET  is  set)  to  be
2619       treated as empty strings when inserted  as  described  above.  If  this
2620       option  is  not  set,  an  attempt  to insert an unset group causes the
2621       PCRE2_ERROR_UNSET error. This option does not  influence  the  extended
2622       substitution syntax described below.
2623
2624       PCRE2_SUBSTITUTE_EXTENDED  causes extra processing to be applied to the
2625       replacement string. Without this option, only the dollar  character  is
2626       special,  and  only  the  group insertion forms listed above are valid.
2627       When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
2628
2629       Firstly, backslash in a replacement string is interpreted as an  escape
2630       character. The usual forms such as \n or \x{ddd} can be used to specify
2631       particular character codes, and backslash followed by any  non-alphanu‐
2632       meric  character  quotes  that character. Extended quoting can be coded
2633       using \Q...\E, exactly as in pattern strings.
2634
2635       There are also four escape sequences for forcing the case  of  inserted
2636       letters.   The  insertion  mechanism has three states: no case forcing,
2637       force upper case, and force lower case. The escape sequences change the
2638       current state: \U and \L change to upper or lower case forcing, respec‐
2639       tively, and \E (when not terminating a \Q quoted sequence)  reverts  to
2640       no  case  forcing. The sequences \u and \l force the next character (if
2641       it is a letter) to upper or lower  case,  respectively,  and  then  the
2642       state automatically reverts to no case forcing. Case forcing applies to
2643       all inserted  characters, including those from captured groups and let‐
2644       ters within \Q...\E quoted sequences.
2645
2646       Note that case forcing sequences such as \U...\E do not nest. For exam‐
2647       ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc";  the  final
2648       \E has no effect.
2649
2650       The  second  effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
2651       flexibility to group substitution. The syntax is similar to  that  used
2652       by Bash:
2653
2654         ${<n>:-<string>}
2655         ${<n>:+<string1>:<string2>}
2656
2657       As  before,  <n> may be a group number or a name. The first form speci‐
2658       fies a default value. If group <n> is set, its value  is  inserted;  if
2659       not,  <string>  is  expanded  and  the result inserted. The second form
2660       specifies strings that are expanded and inserted when group <n> is  set
2661       or  unset,  respectively. The first form is just a convenient shorthand
2662       for
2663
2664         ${<n>:+${<n>}:<string>}
2665
2666       Backslash can be used to escape colons and closing  curly  brackets  in
2667       the  replacement  strings.  A change of the case forcing state within a
2668       replacement string remains  in  force  afterwards,  as  shown  in  this
2669       pcre2test example:
2670
2671         /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
2672             body
2673          1: hello
2674             somebody
2675          1: HELLO
2676
2677       The  PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
2678       substitutions.  However,  PCRE2_SUBSTITUTE_UNKNOWN_UNSET   does   cause
2679       unknown groups in the extended syntax forms to be treated as unset.
2680
2681       If  successful,  pcre2_substitute()  returns the number of replacements
2682       that were made. This may be zero if no matches were found, and is never
2683       greater than 1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
2684
2685       In the event of an error, a negative error code is returned. Except for
2686       PCRE2_ERROR_NOMATCH   (which   is   never   returned),   errors    from
2687       pcre2_match() are passed straight back.
2688
2689       PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring inser‐
2690       tion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
2691
2692       PCRE2_ERROR_UNSET is returned for an unset substring insertion (includ‐
2693       ing  an  unknown  substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set)
2694       when  the  simple  (non-extended)  syntax  is  used  and  PCRE2_SUBSTI‐
2695       TUTE_UNSET_EMPTY is not set.
2696
2697       PCRE2_ERROR_NOMEMORY  is  returned  if  the  output  buffer  is not big
2698       enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size
2699       of  buffer  that is needed is returned via outlengthptr. Note that this
2700       does not happen by default.
2701
2702       PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax  errors  in
2703       the   replacement   string,   with   more   particular   errors   being
2704       PCRE2_ERROR_BADREPESCAPE (invalid  escape  sequence),  PCRE2_ERROR_REP‐
2705       MISSING_BRACE      (closing      curly     bracket     not     found),
2706       PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended  group  substitu‐
2707       tion),  and  PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before
2708       it started, which can happen if \K is used in an assertion).
2709
2710       As for all PCRE2 errors, a text message that describes the error can be
2711       obtained   by   calling  the  pcre2_get_error_message()  function  (see
2712       "Obtaining a textual error message" above).
2713

DUPLICATE SUBPATTERN NAMES

2715
2716       int pcre2_substring_nametable_scan(const pcre2_code *code,
2717         PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
2718
2719       When a pattern is compiled with the PCRE2_DUPNAMES  option,  names  for
2720       subpatterns  are  not required to be unique. Duplicate names are always
2721       allowed for subpatterns with the same number, created by using the  (?|
2722       feature.  Indeed,  if  such subpatterns are named, they are required to
2723       use the same names.
2724
2725       Normally, patterns with duplicate names are such that in any one match,
2726       only  one of the named subpatterns participates. An example is shown in
2727       the pcre2pattern documentation.
2728
2729       When  duplicates   are   present,   pcre2_substring_copy_byname()   and
2730       pcre2_substring_get_byname()  return  the first substring corresponding
2731       to  the  given  name  that  is  set.  Only   if   none   are   set   is
2732       PCRE2_ERROR_UNSET  returned.  The  pcre2_substring_number_from_name()
2733       function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
2734       duplicate names.
2735
2736       If  you want to get full details of all captured substrings for a given
2737       name, you must use the pcre2_substring_nametable_scan()  function.  The
2738       first  argument is the compiled pattern, and the second is the name. If
2739       the third and fourth arguments are NULL, the function returns  a  group
2740       number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
2741
2742       When the third and fourth arguments are not NULL, they must be pointers
2743       to variables that are updated by the function. After it has  run,  they
2744       point to the first and last entries in the name-to-number table for the
2745       given name, and the function returns the length of each entry  in  code
2746       units.  In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
2747       no entries for the given name.
2748
2749       The format of the name table is described above in the section entitled
2750       Information  about  a  pattern.  Given all the relevant entries for the
2751       name, you can extract each of their numbers,  and  hence  the  captured
2752       data.
2753

FINDING ALL POSSIBLE MATCHES AT ONE POSITION

2755
2756       The  traditional  matching  function  uses a similar algorithm to Perl,
2757       which stops when it finds the first match at a given point in the  sub‐
2758       ject. If you want to find all possible matches, or the longest possible
2759       match at a given position,  consider  using  the  alternative  matching
2760       function  (see  below) instead. If you cannot use the alternative func‐
2761       tion, you can kludge it up by making use of the callout facility, which
2762       is described in the pcre2callout documentation.
2763
2764       What you have to do is to insert a callout right at the end of the pat‐
2765       tern.  When your callout function is called, extract and save the  cur‐
2766       rent  matched  substring.  Then return 1, which forces pcre2_match() to
2767       backtrack and try other alternatives. Ultimately, when it runs  out  of
2768       matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
2769

MATCHING A PATTERN: THE ALTERNATIVE FUNCTION

2771
2772       int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
2773         PCRE2_SIZE length, PCRE2_SIZE startoffset,
2774         uint32_t options, pcre2_match_data *match_data,
2775         pcre2_match_context *mcontext,
2776         int *workspace, PCRE2_SIZE wscount);
2777
2778       The  function  pcre2_dfa_match()  is  called  to match a subject string
2779       against a compiled pattern, using a matching algorithm that  scans  the
2780       subject  string  just  once, and does not backtrack. This has different
2781       characteristics to the normal algorithm, and  is  not  compatible  with
2782       Perl.  Some of the features of PCRE2 patterns are not supported. Never‐
2783       theless, there are times when this kind of matching can be useful.  For
2784       a  discussion  of  the  two matching algorithms, and a list of features
2785       that pcre2_dfa_match() does not support, see the pcre2matching documen‐
2786       tation.
2787
2788       The  arguments  for  the pcre2_dfa_match() function are the same as for
2789       pcre2_match(), plus two extras. The ovector within the match data block
2790       is used in a different way, and this is described below. The other com‐
2791       mon arguments are used in the same way as for pcre2_match(),  so  their
2792       description is not repeated here.
2793
2794       The  two  additional  arguments provide workspace for the function. The
2795       workspace vector should contain at least 20 elements. It  is  used  for
2796       keeping  track  of  multiple  paths  through  the  pattern  tree.  More
2797       workspace is needed for patterns and subjects where there are a lot  of
2798       potential matches.
2799
2800       Here is an example of a simple call to pcre2_dfa_match():
2801
2802         int wspace[20];
2803         pcre2_match_data *md = pcre2_match_data_create(4, NULL);
2804         int rc = pcre2_dfa_match(
2805           re,             /* result of pcre2_compile() */
2806           "some string",  /* the subject string */
2807           11,             /* the length of the subject string */
2808           0,              /* start at offset 0 in the subject */
2809           0,              /* default options */
2810           md,             /* the match data block */
2811           NULL,           /* a match context; NULL means use defaults */
2812           wspace,         /* working space vector */
2813           20);            /* number of elements (NOT size in bytes) */
2814
2815   Option bits for pcre2_dfa_match()
2816
2817       The  unused  bits of the options argument for pcre2_dfa_match() must be
2818       zero. The only bits that may be set are  PCRE2_ANCHORED,  PCRE2_NOTBOL,
2819       PCRE2_NOTEOL,          PCRE2_NOTEMPTY,          PCRE2_NOTEMPTY_ATSTART,
2820       PCRE2_NO_UTF_CHECK,       PCRE2_PARTIAL_HARD,       PCRE2_PARTIAL_SOFT,
2821       PCRE2_DFA_SHORTEST,  and  PCRE2_DFA_RESTART.  All  but the last four of
2822       these are exactly the same as for pcre2_match(), so  their  description
2823       is not repeated here.
2824
2825         PCRE2_PARTIAL_HARD
2826         PCRE2_PARTIAL_SOFT
2827
2828       These  have  the  same general effect as they do for pcre2_match(), but
2829       the details are slightly different. When PCRE2_PARTIAL_HARD is set  for
2830       pcre2_dfa_match(),  it  returns  PCRE2_ERROR_PARTIAL  if the end of the
2831       subject is reached and there is still at least one matching possibility
2832       that requires additional characters. This happens even if some complete
2833       matches have already been found. When PCRE2_PARTIAL_SOFT  is  set,  the
2834       return  code  PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
2835       if the end of the subject is  reached,  there  have  been  no  complete
2836       matches, but there is still at least one matching possibility. The por‐
2837       tion of the string that was inspected when the  longest  partial  match
2838       was found is set as the first matching string in both cases. There is a
2839       more detailed discussion of partial and  multi-segment  matching,  with
2840       examples, in the pcre2partial documentation.
2841
2842         PCRE2_DFA_SHORTEST
2843
2844       Setting  the PCRE2_DFA_SHORTEST option causes the matching algorithm to
2845       stop as soon as it has found one match. Because of the way the alterna‐
2846       tive  algorithm  works, this is necessarily the shortest possible match
2847       at the first possible matching point in the subject string.
2848
2849         PCRE2_DFA_RESTART
2850
2851       When pcre2_dfa_match() returns a partial match, it is possible to  call
2852       it again, with additional subject characters, and have it continue with
2853       the same match. The PCRE2_DFA_RESTART option requests this action; when
2854       it is set, the workspace and wscount arguments must reference the same
2855       vector as before because data about the match so far is  left  in  them
2856       after a partial match. There is more discussion of this facility in the
2857       pcre2partial documentation.
2858
2859   Successful returns from pcre2_dfa_match()
2860
2861       When pcre2_dfa_match() succeeds, it may have matched more than one sub‐
2862       string in the subject. Note, however, that all the matches from one run
2863       of the function start at the same point in  the  subject.  The  shorter
2864       matches  are all initial substrings of the longer matches. For example,
2865       if the pattern
2866
2867         <.*>
2868
2869       is matched against the string
2870
2871         This is <something> <something else> <something further> no more
2872
2873       the three matched strings are
2874
2875         <something> <something else> <something further>
2876         <something> <something else>
2877         <something>
2878
2879       On success, the yield of the function is a number  greater  than  zero,
2880       which  is  the  number  of  matched substrings. The offsets of the sub‐
2881       strings are returned in the ovector, and can be extracted by number  in
2882       the  same way as for pcre2_match(), but the numbers bear no relation to
2883       any capturing groups that may exist in the pattern, because DFA  match‐
2884       ing does not support group capture.
2885
2886       Calls  to  the  convenience  functions  that extract substrings by name
2887       return the error PCRE2_ERROR_DFA_UFUNC (unsupported function)  if  used
2888       after a DFA match. The convenience functions that extract substrings by
2889       number never return PCRE2_ERROR_NOSUBSTRING, and the meanings  of  some
2890       other errors are slightly different:
2891
2892         PCRE2_ERROR_UNAVAILABLE
2893
2894       The ovector is not big enough to include a slot for the given substring
2895       number.
2896
2897         PCRE2_ERROR_UNSET
2898
2899       There is a slot in the ovector  for  this  substring,  but  there  were
2900       insufficient matches to fill it.
2901
2902       The  matched  strings  are  stored  in  the ovector in reverse order of
2903       length; that is, the longest matching string is first.  If  there  were
2904       too  many matches to fit into the ovector, the yield of the function is
2905       zero, and the vector is filled with the longest matches.
2906
2907       NOTE: PCRE2's "auto-possessification" optimization usually  applies  to
2908       character  repeats at the end of a pattern (as well as internally). For
2909       example, the pattern "a\d+" is compiled as if it were "a\d++". For  DFA
2910       matching,  this  means  that  only  one possible match is found. If you
2911       really do want multiple matches in such cases, either use  an  ungreedy
2912       repeat  such  as  "a\d+?"  or set the PCRE2_NO_AUTO_POSSESS option when
2913       compiling.
2914
2915   Error returns from pcre2_dfa_match()
2916
2917       The pcre2_dfa_match() function returns a negative number when it fails.
2918       Many  of  the  errors  are  the same as for pcre2_match(), as described
2919       above.  There are in addition the following errors that are specific to
2920       pcre2_dfa_match():
2921
2922         PCRE2_ERROR_DFA_UITEM
2923
2924       This  return  is  given  if pcre2_dfa_match() encounters an item in the
2925       pattern that it does not support, for instance, the use of \C in a  UTF
2926       mode or a back reference.
2927
2928         PCRE2_ERROR_DFA_UCOND
2929
2930       This  return  is given if pcre2_dfa_match() encounters a condition item
2931       that uses a back reference for the condition, or a test  for  recursion
2932       in a specific group. These are not supported.
2933
2934         PCRE2_ERROR_DFA_WSSIZE
2935
2936       This  return  is  given  if  pcre2_dfa_match() runs out of space in the
2937       workspace vector.
2938
2939         PCRE2_ERROR_DFA_RECURSE
2940
2941       When a recursive subpattern is processed, the matching  function  calls
2942       itself recursively, using private memory for the ovector and workspace.
2943       This error is given if the internal ovector is not large  enough.  This
2944       should be extremely rare, as a vector of size 1000 is used.
2945
2946         PCRE2_ERROR_DFA_BADRESTART
2947
2948       When  pcre2_dfa_match()  is  called  with the PCRE2_DFA_RESTART option,
2949       some plausibility checks are made on the  contents  of  the  workspace,
2950       which  should  contain data about the previous partial match. If any of
2951       these checks fail, this error is given.
2952

SEE ALSO

2954
2955       pcre2build(3),   pcre2callout(3),    pcre2demo(3),    pcre2matching(3),
2956       pcre2partial(3),    pcre2posix(3),    pcre2sample(3),    pcre2stack(3),
2957       pcre2unicode(3).
2958

AUTHOR

2960
2961       Philip Hazel
2962       University Computing Service
2963       Cambridge, England.
2964

REVISION

2966
2967       Last updated: 23 December 2016
2968       Copyright (c) 1997-2016 University of Cambridge.
2969
2970
2971
2972PCRE2 10.23                    24 December 2016                    PCRE2API(3)
Impressum