1PCRE2API(3)                Library Functions Manual                PCRE2API(3)
2
3
4

NAME

6       PCRE2 - Perl-compatible regular expressions (revised API)
7
8       #include <pcre2.h>
9
10       PCRE2  is  a  new API for PCRE, starting at release 10.0. This document
11       contains a description of all its native functions. See the pcre2 docu‐
12       ment for an overview of all the PCRE2 documentation.
13

PCRE2 NATIVE API BASIC FUNCTIONS

15
16       pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
17         uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
18         pcre2_compile_context *ccontext);
19
20       void pcre2_code_free(pcre2_code *code);
21
22       pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
23         pcre2_general_context *gcontext);
24
25       pcre2_match_data *pcre2_match_data_create_from_pattern(
26         const pcre2_code *code, pcre2_general_context *gcontext);
27
28       int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
29         PCRE2_SIZE length, PCRE2_SIZE startoffset,
30         uint32_t options, pcre2_match_data *match_data,
31         pcre2_match_context *mcontext);
32
33       int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
34         PCRE2_SIZE length, PCRE2_SIZE startoffset,
35         uint32_t options, pcre2_match_data *match_data,
36         pcre2_match_context *mcontext,
37         int *workspace, PCRE2_SIZE wscount);
38
39       void pcre2_match_data_free(pcre2_match_data *match_data);
40

PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS

42
43       PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
44
45       uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
46
47       PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
48
49       PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
50

PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS

52
53       pcre2_general_context *pcre2_general_context_create(
54         void *(*private_malloc)(PCRE2_SIZE, void *),
55         void (*private_free)(void *, void *), void *memory_data);
56
57       pcre2_general_context *pcre2_general_context_copy(
58         pcre2_general_context *gcontext);
59
60       void pcre2_general_context_free(pcre2_general_context *gcontext);
61

PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS

63
64       pcre2_compile_context *pcre2_compile_context_create(
65         pcre2_general_context *gcontext);
66
67       pcre2_compile_context *pcre2_compile_context_copy(
68         pcre2_compile_context *ccontext);
69
70       void pcre2_compile_context_free(pcre2_compile_context *ccontext);
71
72       int pcre2_set_bsr(pcre2_compile_context *ccontext,
73         uint32_t value);
74
75       int pcre2_set_character_tables(pcre2_compile_context *ccontext,
76         const unsigned char *tables);
77
78       int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext,
79         uint32_t extra_options);
80
81       int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
82         PCRE2_SIZE value);
83
84       int pcre2_set_newline(pcre2_compile_context *ccontext,
85         uint32_t value);
86
87       int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
88         uint32_t value);
89
90       int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
91         int (*guard_function)(uint32_t, void *), void *user_data);
92

PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS

94
95       pcre2_match_context *pcre2_match_context_create(
96         pcre2_general_context *gcontext);
97
98       pcre2_match_context *pcre2_match_context_copy(
99         pcre2_match_context *mcontext);
100
101       void pcre2_match_context_free(pcre2_match_context *mcontext);
102
103       int pcre2_set_callout(pcre2_match_context *mcontext,
104         int (*callout_function)(pcre2_callout_block *, void *),
105         void *callout_data);
106
107       int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
108         int (*callout_function)(pcre2_substitute_callout_block *, void *),
109         void *callout_data);
110
111       int pcre2_set_offset_limit(pcre2_match_context *mcontext,
112         PCRE2_SIZE value);
113
114       int pcre2_set_heap_limit(pcre2_match_context *mcontext,
115         uint32_t value);
116
117       int pcre2_set_match_limit(pcre2_match_context *mcontext,
118         uint32_t value);
119
120       int pcre2_set_depth_limit(pcre2_match_context *mcontext,
121         uint32_t value);
122

PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS

124
125       int pcre2_substring_copy_byname(pcre2_match_data *match_data,
126         PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
127
128       int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
129         uint32_t number, PCRE2_UCHAR *buffer,
130         PCRE2_SIZE *bufflen);
131
132       void pcre2_substring_free(PCRE2_UCHAR *buffer);
133
134       int pcre2_substring_get_byname(pcre2_match_data *match_data,
135         PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
136
137       int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
138         uint32_t number, PCRE2_UCHAR **bufferptr,
139         PCRE2_SIZE *bufflen);
140
141       int pcre2_substring_length_byname(pcre2_match_data *match_data,
142         PCRE2_SPTR name, PCRE2_SIZE *length);
143
144       int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
145         uint32_t number, PCRE2_SIZE *length);
146
147       int pcre2_substring_nametable_scan(const pcre2_code *code,
148         PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
149
150       int pcre2_substring_number_from_name(const pcre2_code *code,
151         PCRE2_SPTR name);
152
153       void pcre2_substring_list_free(PCRE2_SPTR *list);
154
155       int pcre2_substring_list_get(pcre2_match_data *match_data,
156         PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
157

PCRE2 NATIVE API STRING SUBSTITUTION FUNCTION

159
160       int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
161         PCRE2_SIZE length, PCRE2_SIZE startoffset,
162         uint32_t options, pcre2_match_data *match_data,
163         pcre2_match_context *mcontext, PCRE2_SPTR replacement,
164         PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
165         PCRE2_SIZE *outlengthptr);
166

PCRE2 NATIVE API JIT FUNCTIONS

168
169       int pcre2_jit_compile(pcre2_code *code, uint32_t options);
170
171       int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
172         PCRE2_SIZE length, PCRE2_SIZE startoffset,
173         uint32_t options, pcre2_match_data *match_data,
174         pcre2_match_context *mcontext);
175
176       void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
177
178       pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE startsize,
179         PCRE2_SIZE maxsize, pcre2_general_context *gcontext);
180
181       void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
182         pcre2_jit_callback callback_function, void *callback_data);
183
184       void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
185

PCRE2 NATIVE API SERIALIZATION FUNCTIONS

187
188       int32_t pcre2_serialize_decode(pcre2_code **codes,
189         int32_t number_of_codes, const uint8_t *bytes,
190         pcre2_general_context *gcontext);
191
192       int32_t pcre2_serialize_encode(const pcre2_code **codes,
193         int32_t number_of_codes, uint8_t **serialized_bytes,
194         PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
195
196       void pcre2_serialize_free(uint8_t *bytes);
197
198       int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes);
199

PCRE2 NATIVE API AUXILIARY FUNCTIONS

201
202       pcre2_code *pcre2_code_copy(const pcre2_code *code);
203
204       pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
205
206       int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
207         PCRE2_SIZE bufflen);
208
209       const unsigned char *pcre2_maketables(pcre2_general_context *gcontext);
210
211       int pcre2_pattern_info(const pcre2_code *code, uint32_t what,
212         void *where);
213
214       int pcre2_callout_enumerate(const pcre2_code *code,
215         int (*callback)(pcre2_callout_enumerate_block *, void *),
216         void *user_data);
217
218       int pcre2_config(uint32_t what, void *where);
219

PCRE2 NATIVE API OBSOLETE FUNCTIONS

221
222       int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
223         uint32_t value);
224
225       int pcre2_set_recursion_memory_management(
226         pcre2_match_context *mcontext,
227         void *(*private_malloc)(PCRE2_SIZE, void *),
228         void (*private_free)(void *, void *), void *memory_data);
229
230       These  functions became obsolete at release 10.30 and are retained only
231       for backward compatibility. They should not be used in  new  code.  The
232       first  is  replaced by pcre2_set_depth_limit(); the second is no longer
233       needed and has no effect (it always returns zero).
234

PCRE2 EXPERIMENTAL PATTERN CONVERSION FUNCTIONS

236
237       pcre2_convert_context *pcre2_convert_context_create(
238         pcre2_general_context *gcontext);
239
240       pcre2_convert_context *pcre2_convert_context_copy(
241         pcre2_convert_context *cvcontext);
242
243       void pcre2_convert_context_free(pcre2_convert_context *cvcontext);
244
245       int pcre2_set_glob_escape(pcre2_convert_context *cvcontext,
246         uint32_t escape_char);
247
248       int pcre2_set_glob_separator(pcre2_convert_context *cvcontext,
249         uint32_t separator_char);
250
251       int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length,
252         uint32_t options, PCRE2_UCHAR **buffer,
253         PCRE2_SIZE *blength, pcre2_convert_context *cvcontext);
254
255       void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern);
256
257       These functions provide a way of  converting  non-PCRE2  patterns  into
258       patterns  that  can  be  processed by pcre2_compile(). This facility is
259       experimental and may be changed in future releases. At present, "globs"
260       and  POSIX  basic  and  extended patterns can be converted. Details are
261       given in the pcre2convert documentation.
262

PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES

264
265       There are three PCRE2 libraries, supporting 8-bit, 16-bit,  and  32-bit
266       code  units,  respectively.  However,  there  is  just one header file,
267       pcre2.h.  This contains the function prototypes and  other  definitions
268       for all three libraries. One, two, or all three can be installed simul‐
269       taneously. On Unix-like systems the libraries  are  called  libpcre2-8,
270       libpcre2-16, and libpcre2-32, and they can also co-exist with the orig‐
271       inal PCRE libraries.
272
273       Character strings are passed to and from a PCRE2 library as a  sequence
274       of  unsigned  integers  in  code  units of the appropriate width. Every
275       PCRE2 function comes in three different forms, one  for  each  library,
276       for example:
277
278         pcre2_compile_8()
279         pcre2_compile_16()
280         pcre2_compile_32()
281
282       There are also three different sets of data types:
283
284         PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32
285         PCRE2_SPTR8,  PCRE2_SPTR16,  PCRE2_SPTR32
286
287       The  UCHAR  types define unsigned code units of the appropriate widths.
288       For example, PCRE2_UCHAR16 is usually defined as `uint16_t'.  The  SPTR
289       types  are  constant  pointers  to the equivalent UCHAR types, that is,
290       they are pointers to vectors of unsigned code units.
291
292       Many applications use only one code unit width. For their  convenience,
293       macros are defined whose names are the generic forms such as pcre2_com‐
294       pile() and  PCRE2_SPTR.  These  macros  use  the  value  of  the  macro
295       PCRE2_CODE_UNIT_WIDTH  to generate the appropriate width-specific func‐
296       tion and macro names.  PCRE2_CODE_UNIT_WIDTH is not defined by default.
297       An  application  must  define  it  to  be 8, 16, or 32 before including
298       pcre2.h in order to make use of the generic names.
299
300       Applications that use more than one code unit width can be linked  with
301       more  than  one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to
302       be 0 before including pcre2.h, and then use the  real  function  names.
303       Any  code  that  is to be included in an environment where the value of
304       PCRE2_CODE_UNIT_WIDTH is unknown should  also  use  the  real  function
305       names. (Unfortunately, it is not possible in C code to save and restore
306       the value of a macro.)
307
308       If PCRE2_CODE_UNIT_WIDTH is not defined  before  including  pcre2.h,  a
309       compiler error occurs.
310
311       When  using  multiple  libraries  in an application, you must take care
312       when processing any particular pattern to use  only  functions  from  a
313       single  library.   For example, if you want to run a match using a pat‐
314       tern that was compiled with pcre2_compile_16(), you  must  do  so  with
315       pcre2_match_16(), not pcre2_match_8() or pcre2_match_32().
316
317       In  the  function summaries above, and in the rest of this document and
318       other PCRE2 documents, functions and data  types  are  described  using
319       their generic names, without the _8, _16, or _32 suffix.
320

PCRE2 API OVERVIEW

322
323       PCRE2  has  its  own  native  API, which is described in this document.
324       There are also some wrapper functions for the 8-bit library that corre‐
325       spond  to the POSIX regular expression API, but they do not give access
326       to all the functionality of PCRE2. They are described in the pcre2posix
327       documentation. Both these APIs define a set of C function calls.
328
329       The  native  API  C data types, function prototypes, option values, and
330       error codes are defined in the header file pcre2.h, which also contains
331       definitions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release
332       numbers for the library. Applications can use these to include  support
333       for different releases of PCRE2.
334
335       In a Windows environment, if you want to statically link an application
336       program against a non-dll PCRE2 library, you must  define  PCRE2_STATIC
337       before including pcre2.h.
338
339       The  functions pcre2_compile() and pcre2_match() are used for compiling
340       and matching regular expressions in a Perl-compatible manner. A  sample
341       program that demonstrates the simplest way of using them is provided in
342       the file called pcre2demo.c in the PCRE2 source distribution. A listing
343       of  this  program  is  given  in  the  pcre2demo documentation, and the
344       pcre2sample documentation describes how to compile and run it.
345
346       The compiling and matching functions recognize various options that are
347       passed as bits in an options argument. There are also some more compli‐
348       cated  parameters  such  as  custom  memory  management  functions  and
349       resource  limits  that  are passed in "contexts" (which are just memory
350       blocks, described below). Simple applications do not need to  make  use
351       of contexts.
352
353       Just-in-time  (JIT)  compiler  support  is an optional feature of PCRE2
354       that can be built in  appropriate  hardware  environments.  It  greatly
355       speeds  up  the  matching  performance  of  many patterns. Programs can
356       request that it be used if  available  by  calling  pcre2_jit_compile()
357       after a pattern has been successfully compiled by pcre2_compile(). This
358       does nothing if JIT support is not available.
359
360       More complicated programs might need to  make  use  of  the  specialist
361       functions    pcre2_jit_stack_create(),    pcre2_jit_stack_free(),   and
362       pcre2_jit_stack_assign() in order to  control  the  JIT  code's  memory
363       usage.
364
365       JIT matching is automatically used by pcre2_match() if it is available,
366       unless the PCRE2_NO_JIT option is set. There is also a direct interface
367       for  JIT  matching,  which gives improved performance at the expense of
368       less sanity checking. The JIT-specific functions are discussed  in  the
369       pcre2jit documentation.
370
371       A  second  matching function, pcre2_dfa_match(), which is not Perl-com‐
372       patible, is also provided. This uses  a  different  algorithm  for  the
373       matching.  The  alternative  algorithm finds all possible matches (at a
374       given point in the subject), and scans the subject  just  once  (unless
375       there  are  lookaround  assertions).  However,  this algorithm does not
376       return captured substrings. A description of  the  two  matching  algo‐
377       rithms   and  their  advantages  and  disadvantages  is  given  in  the
378       pcre2matching   documentation.   There   is   no   JIT   support    for
379       pcre2_dfa_match().
380
381       In  addition  to  the  main compiling and matching functions, there are
382       convenience functions for extracting captured substrings from a subject
383       string that has been matched by pcre2_match(). They are:
384
385         pcre2_substring_copy_byname()
386         pcre2_substring_copy_bynumber()
387         pcre2_substring_get_byname()
388         pcre2_substring_get_bynumber()
389         pcre2_substring_list_get()
390         pcre2_substring_length_byname()
391         pcre2_substring_length_bynumber()
392         pcre2_substring_nametable_scan()
393         pcre2_substring_number_from_name()
394
395       pcre2_substring_free()  and  pcre2_substring_list_free()  are also pro‐
396       vided, to free memory used for extracted strings. If  either  of  these
397       functions  is called with a NULL argument, the function returns immedi‐
398       ately without doing anything.
399
400       The function pcre2_substitute() can be called to match  a  pattern  and
401       return  a  copy of the subject string with substitutions for parts that
402       were matched.
403
404       Functions whose names begin with pcre2_serialize_ are used  for  saving
405       compiled patterns on disc or elsewhere, and reloading them later.
406
407       Finally,  there  are functions for finding out information about a com‐
408       piled pattern (pcre2_pattern_info()) and about the  configuration  with
409       which PCRE2 was built (pcre2_config()).
410
411       Functions  with  names  ending with _free() are used for freeing memory
412       blocks of various sorts. In all cases, if one  of  these  functions  is
413       called with a NULL argument, it does nothing.
414

STRING LENGTHS AND OFFSETS

416
417       The  PCRE2  API  uses  string  lengths and offsets into strings of code
418       units in several places. These values are always  of  type  PCRE2_SIZE,
419       which  is an unsigned integer type, currently always defined as size_t.
420       The largest  value  that  can  be  stored  in  such  a  type  (that  is
421       ~(PCRE2_SIZE)0)  is reserved as a special indicator for zero-terminated
422       strings and unset offsets.  Therefore, the longest string that  can  be
423       handled is one less than this maximum.
424

NEWLINES

426
427       PCRE2 supports five different conventions for indicating line breaks in
428       strings: a single CR (carriage return) character, a  single  LF  (line‐
429       feed) character, the two-character sequence CRLF, any of the three pre‐
430       ceding, or any Unicode newline sequence. The Unicode newline  sequences
431       are  the  three just mentioned, plus the single characters VT (vertical
432       tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line
433       separator, U+2028), and PS (paragraph separator, U+2029).
434
435       Each  of  the first three conventions is used by at least one operating
436       system as its standard newline sequence. When PCRE2 is built, a default
437       can be specified.  If it is not, the default is set to LF, which is the
438       Unix standard. However, the newline convention can  be  changed  by  an
439       application  when  calling  pcre2_compile(),  or it can be specified by
440       special text at the start of the pattern  itself;  this  overrides  any
441       other  settings.  See  the pcre2pattern page for details of the special
442       character sequences.
443
444       In the PCRE2 documentation the word "newline"  is  used  to  mean  "the
445       character or pair of characters that indicate a line break". The choice
446       of newline convention affects the handling of the dot, circumflex,  and
447       dollar metacharacters, the handling of #-comments in /x mode, and, when
448       CRLF is a recognized line ending sequence, the match position  advance‐
449       ment for a non-anchored pattern. There is more detail about this in the
450       section on pcre2_match() options below.
451
452       The choice of newline convention does not affect the interpretation  of
453       the \n or \r escape sequences, nor does it affect what \R matches; this
454       has its own separate convention.
455

MULTITHREADING

457
458       In a multithreaded application it is important to keep  thread-specific
459       data  separate  from data that can be shared between threads. The PCRE2
460       library code itself is thread-safe: it contains  no  static  or  global
461       variables.  The  API  is  designed to be fairly simple for non-threaded
462       applications while at the same time ensuring that multithreaded  appli‐
463       cations can use it.
464
465       There are several different blocks of data that are used to pass infor‐
466       mation between the application and the PCRE2 libraries.
467
468   The compiled pattern
469
470       A pointer to the compiled form of a pattern is  returned  to  the  user
471       when pcre2_compile() is successful. The data in the compiled pattern is
472       fixed, and does not change when the pattern is matched.  Therefore,  it
473       is  thread-safe, that is, the same compiled pattern can be used by more
474       than one thread simultaneously. For example, an application can compile
475       all its patterns at the start, before forking off multiple threads that
476       use them. However, if the just-in-time (JIT)  optimization  feature  is
477       being  used,  it needs separate memory stack areas for each thread. See
478       the pcre2jit documentation for more details.
479
480       In a more complicated situation, where patterns are compiled only  when
481       they  are  first needed, but are still shared between threads, pointers
482       to compiled patterns must be protected  from  simultaneous  writing  by
483       multiple threads, at least until a pattern has been compiled. The logic
484       can be something like this:
485
486         Get a read-only (shared) lock (mutex) for pointer
487         if (pointer == NULL)
488           {
489           Get a write (unique) lock for pointer
490           if (pointer == NULL) pointer = pcre2_compile(...
491           }
492         Release the lock
493         Use pointer in pcre2_match()
494
495       Of course, testing for compilation errors should also  be  included  in
496       the code.
497
498       If JIT is being used, but the JIT compilation is not being done immedi‐
499       ately (perhaps waiting to see if the pattern  is  used  often  enough),
500       similar logic is required. JIT compilation updates a pointer within the
501       compiled code block, so a thread must gain unique write access  to  the
502       pointer     before    calling    pcre2_jit_compile().    Alternatively,
503       pcre2_code_copy()  or  pcre2_code_copy_with_tables()  can  be  used  to
504       obtain  a private copy of the compiled code before calling the JIT com‐
505       piler.
506
507   Context blocks
508
509       The next main section below introduces the idea of "contexts" in  which
510       PCRE2 functions are called. A context is nothing more than a collection
511       of parameters that control the way PCRE2 operates. Grouping a number of
512       parameters together in a context is a convenient way of passing them to
513       a PCRE2 function without using lots of arguments. The  parameters  that
514       are  stored  in  contexts  are in some sense "advanced features" of the
515       API. Many straightforward applications will not need to use contexts.
516
517       In a multithreaded application, if the parameters in a context are val‐
518       ues  that  are  never  changed, the same context can be used by all the
519       threads. However, if any thread needs to change any value in a context,
520       it must make its own thread-specific copy.
521
522   Match blocks
523
524       The  matching  functions need a block of memory for storing the results
525       of a match. This includes details of what was matched, as well as addi‐
526       tional  information  such as the name of a (*MARK) setting. Each thread
527       must provide its own copy of this memory.
528

PCRE2 CONTEXTS

530
531       Some PCRE2 functions have a lot of parameters, many of which  are  used
532       only  by  specialist  applications,  for example, those that use custom
533       memory management or non-standard character tables.  To  keep  function
534       argument  lists  at a reasonable size, and at the same time to keep the
535       API extensible, "uncommon" parameters are passed to  certain  functions
536       in  a  context instead of directly. A context is just a block of memory
537       that holds the parameter values.  Applications  that  do  not  need  to
538       adjust  any  of  the  context  parameters  can pass NULL when a context
539       pointer is required.
540
541       There are three different types of context: a general context  that  is
542       relevant  for  several  PCRE2 operations, a compile-time context, and a
543       match-time context.
544
545   The general context
546
547       At present, this context just  contains  pointers  to  (and  data  for)
548       external  memory  management  functions  that  are  called from several
549       places in the PCRE2 library. The context is named `general' rather than
550       specifically  `memory'  because in future other fields may be added. If
551       you do not want to supply your own custom memory management  functions,
552       you  do not need to bother with a general context. A general context is
553       created by:
554
555       pcre2_general_context *pcre2_general_context_create(
556         void *(*private_malloc)(PCRE2_SIZE, void *),
557         void (*private_free)(void *, void *), void *memory_data);
558
559       The two function pointers specify custom memory  management  functions,
560       whose prototypes are:
561
562         void *private_malloc(PCRE2_SIZE, void *);
563         void  private_free(void *, void *);
564
565       Whenever code in PCRE2 calls these functions, the final argument is the
566       value of memory_data. Either of the first two arguments of the creation
567       function  may be NULL, in which case the system memory management func‐
568       tions malloc() and free() are used. (This is not currently  useful,  as
569       there  are  no  other  fields in a general context, but in future there
570       might be.)  The private_malloc() function  is  used  (if  supplied)  to
571       obtain  memory  for storing the context, and all three values are saved
572       as part of the context.
573
574       Whenever PCRE2 creates a data block of any kind, the block  contains  a
575       pointer  to the free() function that matches the malloc() function that
576       was used. When the time comes to  free  the  block,  this  function  is
577       called.
578
579       A general context can be copied by calling:
580
581       pcre2_general_context *pcre2_general_context_copy(
582         pcre2_general_context *gcontext);
583
584       The memory used for a general context should be freed by calling:
585
586       void pcre2_general_context_free(pcre2_general_context *gcontext);
587
588       If  this  function  is  passed  a NULL argument, it returns immediately
589       without doing anything.
590
591   The compile context
592
593       A compile context is required if you want to provide an external  func‐
594       tion  for  stack  checking  during compilation or to change the default
595       values of any of the following compile-time parameters:
596
597         What \R matches (Unicode newlines or CR, LF, CRLF only)
598         PCRE2's character tables
599         The newline character sequence
600         The compile time nested parentheses limit
601         The maximum length of the pattern string
602         The extra options bits (none set by default)
603
604       A compile context is also required if you are using custom memory  man‐
605       agement.   If  none of these apply, just pass NULL as the context argu‐
606       ment of pcre2_compile().
607
608       A compile context is created, copied, and freed by the following  func‐
609       tions:
610
611       pcre2_compile_context *pcre2_compile_context_create(
612         pcre2_general_context *gcontext);
613
614       pcre2_compile_context *pcre2_compile_context_copy(
615         pcre2_compile_context *ccontext);
616
617       void pcre2_compile_context_free(pcre2_compile_context *ccontext);
618
619       A  compile  context  is created with default values for its parameters.
620       These can be changed by calling the following functions, which return 0
621       on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
622
623       int pcre2_set_bsr(pcre2_compile_context *ccontext,
624         uint32_t value);
625
626       The  value  must  be PCRE2_BSR_ANYCRLF, to specify that \R matches only
627       CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R  matches  any
628       Unicode line ending sequence. The value is used by the JIT compiler and
629       by  the  two  interpreted   matching   functions,   pcre2_match()   and
630       pcre2_dfa_match().
631
632       int pcre2_set_character_tables(pcre2_compile_context *ccontext,
633         const unsigned char *tables);
634
635       The  value  must  be  the result of a call to pcre2_maketables(), whose
636       only argument is a general context. This function builds a set of char‐
637       acter tables in the current locale.
638
639       int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext,
640         uint32_t extra_options);
641
642       As  PCRE2  has developed, almost all the 32 option bits that are avail‐
643       able in the options argument of pcre2_compile() have been used  up.  To
644       avoid  running  out, the compile context contains a set of extra option
645       bits which are used for some newer, assumed rarer, options. This  func‐
646       tion sets those bits. It always sets all the bits (either on or off),
647       replacing any existing setting. The available options are defined in
648       the section entitled "Extra compile options" below.
649
650       int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
651         PCRE2_SIZE value);
652
653       This  sets a maximum length, in code units, for any pattern string that
654       is compiled with this context. If the pattern is longer,  an  error  is
655       generated.   This facility is provided so that applications that accept
656       patterns from external sources can limit their size. The default is the
657       largest  number  that  a  PCRE2_SIZE variable can hold, which is effec‐
658       tively unlimited.
659
660       int pcre2_set_newline(pcre2_compile_context *ccontext,
661         uint32_t value);
662
663       This specifies which characters or character sequences are to be recog‐
664       nized  as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage
665       return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the
666       two-character  sequence  CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any
667       of the above), PCRE2_NEWLINE_ANY (any  Unicode  newline  sequence),  or
668       PCRE2_NEWLINE_NUL (the NUL character, that is a binary zero).
669
670       A pattern can override the value set in the compile context by starting
671       with a sequence such as (*CRLF). See the pcre2pattern page for details.
672
673       When   a   pattern   is   compiled   with   the    PCRE2_EXTENDED    or
674       PCRE2_EXTENDED_MORE option, the newline convention affects the recogni‐
675       tion of the end of internal comments starting  with  #.  The  value  is
676       saved  with the compiled pattern for subsequent use by the JIT compiler
677       and by  the  two  interpreted  matching  functions,  pcre2_match()  and
678       pcre2_dfa_match().
679
680       int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
681         uint32_t value);
682
683       This  parameter  adjusts  the  limit,  set when PCRE2 is built (default
684       250), on the depth of parenthesis nesting  in  a  pattern.  This  limit
685       stops  rogue  patterns  using  up too much system stack when being com‐
686       piled. The limit applies to parentheses of all kinds, not just  captur‐
687       ing parentheses.
688
689       int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
690         int (*guard_function)(uint32_t, void *), void *user_data);
691
692       There  is at least one application that runs PCRE2 in threads with very
693       limited system stack, where running out of stack is to  be  avoided  at
694       all  costs. The parenthesis limit above cannot take account of how much
695       stack is actually available during compilation. For  a  finer  control,
696       you  can  supply  a  function  that  is called whenever pcre2_compile()
697       starts to compile a parenthesized part of a pattern. This function  can
698       check  the  actual  stack  size  (or anything else that it wants to, of
699       course).
700
701       The  first  argument  to the guard function gives the current depth  of
702       nesting,  and  the second is user data that is set up by the last argu‐
703       ment  of  pcre2_set_compile_recursion_guard().   The   guard   function
704       should return zero if all is well, or non-zero to force an error.
705
706   The match context
707
708       A match context is required if you want to:
709
710         Set up a callout function
711         Set an offset limit for matching an unanchored pattern
712         Change the limit on the amount of heap used when matching
713         Change the backtracking match limit
714         Change the backtracking depth limit
715         Set custom memory management specifically for the match
716
717       If  none  of  these  apply,  just  pass NULL as the context argument of
718       pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match().
719
720       A match context is created, copied, and freed by  the  following  func‐
721       tions:
722
723       pcre2_match_context *pcre2_match_context_create(
724         pcre2_general_context *gcontext);
725
726       pcre2_match_context *pcre2_match_context_copy(
727         pcre2_match_context *mcontext);
728
729       void pcre2_match_context_free(pcre2_match_context *mcontext);
730
731       A  match  context  is  created  with default values for its parameters.
732       These can be changed by calling the following functions, which return 0
733       on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
734
735       int pcre2_set_callout(pcre2_match_context *mcontext,
736         int (*callout_function)(pcre2_callout_block *, void *),
737         void *callout_data);
738
739       This  sets  up a callout function for PCRE2 to call at specified points
740       during a matching operation. Details are given in the pcre2callout doc‐
741       umentation.
742
743       int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
744         int (*callout_function)(pcre2_substitute_callout_block *, void *),
745         void *callout_data);
746
747       This  sets up a callout function for PCRE2 to call after each substitu‐
748       tion made by pcre2_substitute(). Details are given in the section enti‐
749       tled "Creating a new string with substitutions" below.
750
751       int pcre2_set_offset_limit(pcre2_match_context *mcontext,
752         PCRE2_SIZE value);
753
754       The  offset_limit  parameter  limits  how  far an unanchored search can
755       advance in the subject string. The default value  is  PCRE2_UNSET.  The
756       pcre2_match()      and      pcre2_dfa_match()      functions     return
757       PCRE2_ERROR_NOMATCH if a match with a starting point before or  at  the
758       given  offset  is  not  found. The pcre2_substitute() function makes no
759       more substitutions.
760
761       For example, if the pattern /abc/ is matched against "123abc"  with  an
762       offset  limit  less  than 3, the result is PCRE2_ERROR_NOMATCH. A match
763       can never be  found  if  the  startoffset  argument  of  pcre2_match(),
764       pcre2_dfa_match(),  or  pcre2_substitute()  is  greater than the offset
765       limit set in the match context.
766
767       When using this  facility,  you  must  set  the  PCRE2_USE_OFFSET_LIMIT
768       option when calling pcre2_compile() so that when JIT is in use, differ‐
769       ent code can be compiled. If a match  is  started  with  a  non-default
770       offset limit when PCRE2_USE_OFFSET_LIMIT is not set, an error is gener‐
771       ated.
772
773       The offset limit facility can be used to track progress when  searching
774       large  subject  strings or to limit the extent of global substitutions.
775       See also the PCRE2_FIRSTLINE option, which requires a  match  to  start
776       before  or  at  the first newline that follows the start of matching in
777       the subject. If this is set with an offset limit, a match must occur in
778       the first line and also within the offset limit. In other words, which‐
779       ever limit comes first is used.
780
781       int pcre2_set_heap_limit(pcre2_match_context *mcontext,
782         uint32_t value);
783
784       The heap_limit parameter specifies, in units of kibibytes (1024 bytes),
785       the  maximum  amount  of heap memory that pcre2_match() may use to hold
786       backtracking information when running an interpretive match. This limit
787       also applies to pcre2_dfa_match(), which may use the heap when process‐
788       ing patterns with a lot of nested pattern recursion or  lookarounds  or
789       atomic groups. This limit does not apply to matching with the JIT opti‐
790       mization, which has  its  own  memory  control  arrangements  (see  the
791       pcre2jit  documentation for more details). If the limit is reached, the
792       negative error code  PCRE2_ERROR_HEAPLIMIT  is  returned.  The  default
793       limit  can be set when PCRE2 is built; if it is not, the default is set
794       very large and is essentially "unlimited".
795
796       A value for the heap limit may also be supplied by an item at the start
797       of a pattern of the form
798
799         (*LIMIT_HEAP=ddd)
800
801       where  ddd  is  a  decimal  number.  However, such a setting is ignored
802       unless ddd is less than the limit set by the  caller  of  pcre2_match()
803       or, if no such limit is set, less than the default.
804
805       The  pcre2_match() function starts out using a 20KiB vector on the sys‐
806       tem stack for recording backtracking points. The more nested backtrack‐
807       ing  points  there  are (that is, the deeper the search tree), the more
808       memory is needed.  Heap memory is used only if the  initial  vector  is
809       too small. If the heap limit is set to a value less than 21 (in partic‐
810       ular, zero) no heap memory will be used. In this  case,  only  patterns
811       that  do not have a lot of nested backtracking can be successfully pro‐
812       cessed.
813
814       Similarly, for pcre2_dfa_match(), a vector on the system stack is  used
815       when  processing pattern recursions, lookarounds, or atomic groups, and
816       only if this is not big enough is heap memory used. In this case,  too,
817       setting a value of zero disables the use of the heap.
818
819       int pcre2_set_match_limit(pcre2_match_context *mcontext,
820         uint32_t value);
821
822       The  match_limit  parameter  provides  a means of preventing PCRE2 from
823       using up too many computing resources when processing patterns that are
824       not going to match, but which have a very large number of possibilities
825       in their search trees. The classic  example  is  a  pattern  that  uses
826       nested unlimited repeats.
827
828       There  is an internal counter in pcre2_match() that is incremented each
829       time round its main matching loop. If  this  value  reaches  the  match
830       limit, pcre2_match() returns the negative value PCRE2_ERROR_MATCHLIMIT.
831       This has the effect of limiting the amount  of  backtracking  that  can
832       take place. For patterns that are not anchored, the count restarts from
833       zero for each position in the subject string. This limit  also  applies
834       to pcre2_dfa_match(), though the counting is done in a different way.
835
836       When  pcre2_match() is called with a pattern that was successfully pro‐
837       cessed by pcre2_jit_compile(), the way in which matching is executed is
838       entirely  different. However, there is still the possibility of runaway
839       matching that goes on for a very long  time,  and  so  the  match_limit
840       value  is  also used in this case (but in a different way) to limit how
841       long the matching can continue.
842
843       The default value for the limit can be set when PCRE2 is built; if it is
844       not, the default is 10 million, which handles all but the most  extreme
845       cases. A value for the match limit may also be supplied by an  item  at
846       the start of a pattern of the form
847
848         (*LIMIT_MATCH=ddd)
849
850       where  ddd  is  a  decimal  number.  However, such a setting is ignored
851       unless ddd is less than the limit set by the caller of pcre2_match() or
852       pcre2_dfa_match() or, if no such limit is set, less than the default.
853
854       int pcre2_set_depth_limit(pcre2_match_context *mcontext,
855         uint32_t value);
856
857       This   parameter   limits   the   depth   of   nested  backtracking  in
858       pcre2_match().  Each time a nested backtracking point is passed, a  new
859       memory "frame" is used to remember the state of matching at that point.
860       Thus, this parameter indirectly limits the amount  of  memory  that  is
861       used  in  a  match.  However,  because  the size of each memory "frame"
862       depends on the number of capturing parentheses, the actual memory limit
863       varies  from pattern to pattern. This limit was more useful in versions
864       before 10.30, where function recursion was used for backtracking.
865
866       The depth limit is not relevant, and is ignored, when matching is  done
867       using JIT compiled code. However, it is supported by pcre2_dfa_match(),
868       which uses it to limit the depth of nested internal recursive  function
869       calls  that implement atomic groups, lookaround assertions, and pattern
870       recursions. This limits, indirectly, the amount of system stack that is
871       used.  It  was  more useful in versions before 10.32, when stack memory
872       was used for local workspace vectors for recursive function calls. From
873       version  10.32,  only local variables are allocated on the stack and as
874       each call uses only a few hundred bytes, even a small stack can support
875       quite a lot of recursion.
876
877       If  the  depth  of  internal  recursive function calls is great enough,
878       local workspace vectors are allocated on the heap  from  version  10.32
879       onwards,  so  the depth limit also indirectly limits the amount of heap
880       memory that is used. A recursive pattern such as /(.(?2))((?1)|)/, when
881       matched  to a very long string using pcre2_dfa_match(), can use a great
882       deal of memory. However, it is probably  better  to  limit  heap  usage
883       directly by calling pcre2_set_heap_limit().
884
885       The  default  value for the depth limit can be set when PCRE2 is built;
886       if it is not, the default is set to the same value as the  default  for
887       the   match   limit.   If  the  limit  is  exceeded,  pcre2_match()  or
888       pcre2_dfa_match() returns PCRE2_ERROR_DEPTHLIMIT. A value for the depth
889       limit  may also be supplied by an item at the start of a pattern of the
890       form
891
892         (*LIMIT_DEPTH=ddd)
893
894       where ddd is a decimal number.  However,  such  a  setting  is  ignored
895       unless ddd is less than the limit set by the caller of pcre2_match() or
896       pcre2_dfa_match() or, if no such limit is set, less than the default.
897

CHECKING BUILD-TIME OPTIONS

899
900       int pcre2_config(uint32_t what, void *where);
901
902       The function pcre2_config() makes it possible for  a  PCRE2  client  to
903       discover  which  optional  features  have  been compiled into the PCRE2
904       library. The pcre2build documentation  has  more  details  about  these
905       optional features.
906
907       The  first  argument  for pcre2_config() specifies which information is
908       required. The second argument is a pointer to  memory  into  which  the
909       information  is  placed.  If  NULL  is passed, the function returns the
910       amount of memory that is needed  for  the  requested  information.  For
911       calls  that  return  numerical  values,  the  value  is  in bytes; when
912       requesting these values, where should point  to  appropriately  aligned
913       memory.  For calls that return strings, the required length is given in
914       code units, not counting the terminating zero.
915
916       When requesting information, the returned value from pcre2_config()  is
917       non-negative  on success, or the negative error code PCRE2_ERROR_BADOP‐
918       TION if the value in the first argument is not recognized. The  follow‐
919       ing information is available:
920
921         PCRE2_CONFIG_BSR
922
923       The  output  is a uint32_t integer whose value indicates what character
924       sequences the \R  escape  sequence  matches  by  default.  A  value  of
925       PCRE2_BSR_UNICODE  means  that  \R  matches  any  Unicode  line  ending
926       sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches  only  CR,
927       LF, or CRLF. The default can be overridden when a pattern is compiled.
928
929         PCRE2_CONFIG_COMPILED_WIDTHS
930
931       The  output  is a uint32_t integer whose lower bits indicate which code
932       unit widths were selected when PCRE2 was  built.  The  1-bit  indicates
933       8-bit  support, and the 2-bit and 4-bit indicate 16-bit and 32-bit sup‐
934       port, respectively.
935
936         PCRE2_CONFIG_DEPTHLIMIT
937
938       The output is a uint32_t integer that gives the default limit  for  the
939       depth  of  nested  backtracking in pcre2_match() or the depth of nested
940       recursions, lookarounds, and atomic groups in  pcre2_dfa_match().  Fur‐
941       ther details are given with pcre2_set_depth_limit() above.
942
943         PCRE2_CONFIG_HEAPLIMIT
944
945       The  output is a uint32_t integer that gives, in kibibytes, the default
946       limit  for  the  amount  of  heap  memory  used  by  pcre2_match()   or
947       pcre2_dfa_match().      Further      details     are     given     with
948       pcre2_set_heap_limit() above.
949
950         PCRE2_CONFIG_JIT
951
952       The output is a uint32_t integer that is set  to  one  if  support  for
953       just-in-time compiling is available; otherwise it is set to zero.
954
955         PCRE2_CONFIG_JITTARGET
956
957       The  where  argument  should point to a buffer that is at least 48 code
958       units long.  (The  exact  length  required  can  be  found  by  calling
959       pcre2_config()  with  where  set  to NULL.) The buffer is filled with a
960       string that contains the name of the architecture  for  which  the  JIT
961       compiler  is  configured,  for  example  "x86  32bit  (little  endian +
962       unaligned)". If JIT support is not available, PCRE2_ERROR_BADOPTION  is
963       returned,  otherwise the number of code units used is returned. This is
964       the length of the string, plus one unit for the terminating zero.
965
966         PCRE2_CONFIG_LINKSIZE
967
968       The output is a uint32_t integer that contains the number of bytes used
969       for  internal  linkage  in  compiled regular expressions. When PCRE2 is
970       configured, the value can be set to 2, 3, or 4, with the default  being
971       2.  This is the value that is returned by pcre2_config(). However, when
972       the 16-bit library is compiled, a value of 3 is rounded up  to  4,  and
973       when  the  32-bit  library  is compiled, internal linkages always use 4
974       bytes, so the configured value is not relevant.
975
976       The default value of 2 for the 8-bit and 16-bit libraries is sufficient
977       for  all but the most massive patterns, since it allows the size of the
978       compiled pattern to be up to 65535  code  units.  Larger  values  allow
979       larger  regular  expressions to be compiled by those two libraries, but
980       at the expense of slower matching.
981
982         PCRE2_CONFIG_MATCHLIMIT
983
984       The output is a uint32_t integer that gives the default match limit for
985       pcre2_match().  Further  details are given with pcre2_set_match_limit()
986       above.
987
988         PCRE2_CONFIG_NEWLINE
989
990       The output is a uint32_t integer  whose  value  specifies  the  default
991       character  sequence that is recognized as meaning "newline". The values
992       are:
993
994         PCRE2_NEWLINE_CR       Carriage return (CR)
995         PCRE2_NEWLINE_LF       Linefeed (LF)
996         PCRE2_NEWLINE_CRLF     Carriage return, linefeed (CRLF)
997         PCRE2_NEWLINE_ANY      Any Unicode line ending
998         PCRE2_NEWLINE_ANYCRLF  Any of CR, LF, or CRLF
999         PCRE2_NEWLINE_NUL      The NUL character (binary zero)
1000
1001       The default should normally correspond to  the  standard  sequence  for
1002       your operating system.
1003
1004         PCRE2_CONFIG_NEVER_BACKSLASH_C
1005
1006       The  output  is  a uint32_t integer that is set to one if the use of \C
1007       was permanently disabled when PCRE2 was built; otherwise it is  set  to
1008       zero.
1009
1010         PCRE2_CONFIG_PARENSLIMIT
1011
1012       The  output is a uint32_t integer that gives the maximum depth of nest‐
1013       ing of parentheses (of any kind) in a pattern. This limit is imposed to
1014       cap  the  amount of system stack used when a pattern is compiled. It is
1015       specified when PCRE2 is built; the default is 250. This limit does  not
1016       take  into  account  the  stack that may already be used by the calling
1017       application. For  finer  control  over  compilation  stack  usage,  see
1018       pcre2_set_compile_recursion_guard().
1019
1020         PCRE2_CONFIG_STACKRECURSE
1021
1022       This parameter is obsolete and should not be used in new code. The out‐
1023       put is a uint32_t integer that is always set to zero.
1024
1025         PCRE2_CONFIG_UNICODE_VERSION
1026
1027       The where argument should point to a buffer that is at  least  24  code
1028       units  long.  (The  exact  length  required  can  be  found  by calling
1029       pcre2_config() with where set to NULL.)  If  PCRE2  has  been  compiled
1030       without  Unicode  support,  the buffer is filled with the text "Unicode
1031       not supported". Otherwise, the Unicode  version  string  (for  example,
1032       "8.0.0")  is  inserted. The number of code units used is returned. This
1033       is the length of the string plus one unit for the terminating zero.
1034
1035         PCRE2_CONFIG_UNICODE
1036
1037       The output is a uint32_t integer that is set to one if Unicode  support
1038       is  available; otherwise it is set to zero. Unicode support implies UTF
1039       support.
1040
1041         PCRE2_CONFIG_VERSION
1042
1043       The where argument should point to a buffer that is at  least  24  code
1044       units  long.  (The  exact  length  required  can  be  found  by calling
1045       pcre2_config() with where set to NULL.) The buffer is filled  with  the
1046       PCRE2 version string, zero-terminated. The number of code units used is
1047       returned. This is the length of the string plus one unit for the termi‐
1048       nating zero.
1049

COMPILING A PATTERN

1051
1052       pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
1053         uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
1054         pcre2_compile_context *ccontext);
1055
1056       void pcre2_code_free(pcre2_code *code);
1057
1058       pcre2_code *pcre2_code_copy(const pcre2_code *code);
1059
1060       pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
1061
1062       The  pcre2_compile() function compiles a pattern into an internal form.
1063       The pattern is defined by a pointer to a string of  code  units  and  a
1064       length  (in  code units). If the pattern is zero-terminated, the length
1065       can be specified  as  PCRE2_ZERO_TERMINATED.  The  function  returns  a
1066       pointer  to  a  block  of memory that contains the compiled pattern and
1067       related data, or NULL if an error occurred.
1068
1069       If the compile context argument ccontext is NULL, memory for  the  com‐
1070       piled  pattern  is  obtained  by  calling  malloc().  Otherwise,  it is
1071       obtained from the same memory function that was used  for  the  compile
1072       context.  The  caller must free the memory by calling pcre2_code_free()
1073       when it is no longer needed.  If pcre2_code_free()  is  called  with  a
1074       NULL argument, it returns immediately, without doing anything.
1075
1076       The function pcre2_code_copy() makes a copy of the compiled code in new
1077       memory, using the same memory allocator as was used for  the  original.
1078       However,  if  the  code  has  been  processed  by the JIT compiler (see
1079       below), the JIT information cannot be copied (because it  is  position-
1080       dependent).  The new copy can initially be used only for non-JIT match‐
1081       ing, though it can be passed to  pcre2_jit_compile()  if  required.  If
1082       pcre2_code_copy() is called with a NULL argument, it returns NULL.
1083
1084       The pcre2_code_copy() function provides a way for individual threads in
1085       a multithreaded application to acquire a private copy  of  shared  com‐
1086       piled  code.   However, it does not make a copy of the character tables
1087       used by the compiled pattern; the new pattern code points to  the  same
1088       tables  as  the original code.  (See "Locale Support" below for details
1089       of these character tables.) In many applications the  same  tables  are
1090       used  throughout, so this behaviour is appropriate. Nevertheless, there
1091       are occasions when a copy of a compiled pattern and the relevant tables
1092       are   needed;  pcre2_code_copy_with_tables()  provides  this  facility.
1093       Copies of both the code and the tables are  made,  with  the  new  code
1094       pointing  to the new tables. The memory for the new tables is automati‐
1095       cally freed when pcre2_code_free() is called for the new  copy  of  the
1096       compiled  code.  If pcre2_code_copy_with_tables() is called with a NULL
1097       argument, it returns NULL.
1098
1099       NOTE: When one of the matching functions is  called,  pointers  to  the
1100       compiled pattern and the subject string are set in the match data block
1101       so that they can be referenced by the  substring  extraction  functions
1102       after  a  successful match.  After running a match, you must not free a
1103       compiled pattern or a subject string until after all operations on  the
1104       match  data  block have taken place, unless, in the case of the subject
1105       string, you have used the PCRE2_COPY_MATCHED_SUBJECT option,  which  is
1106       described  in  the  section  entitled  "Option  bits for pcre2_match()"
1107       below.
1108
1109       The options argument for pcre2_compile() contains various bit  settings
1110       that  affect  the  compilation.  It  should be zero if none of them are
1111       required. The available options are described below. Some of  them  (in
1112       particular,  those  that  are  compatible with Perl, but some others as
1113       well) can also be set and  unset  from  within  the  pattern  (see  the
1114       detailed description in the pcre2pattern documentation).
1115
1116       For  those options that can be different in different parts of the pat‐
1117       tern,  the  contents of the options argument specify their settings  at
1118       the  start  of  compilation. The PCRE2_ANCHORED, PCRE2_ENDANCHORED, and
1119       PCRE2_NO_UTF_CHECK options can be set at the time of matching  as  well
1120       as at compile time.
1121
1122       Some  additional  options  and  less  frequently  required compile-time
1123       parameters (for example, the newline setting) can be provided in a com‐
1124       pile context (as described above).
1125
1126       If errorcode or erroroffset is NULL, pcre2_compile() returns NULL imme‐
1127       diately. Otherwise, the variables to which these point are  set  to  an
1128       error  code  and  an  offset (number of code units) within the pattern,
1129       respectively, when pcre2_compile() returns NULL because  a  compilation
1130       error has occurred. The values are not defined when compilation is suc‐
1131       cessful and pcre2_compile() returns a non-NULL value.
1132
1133       There are nearly 100 positive  error  codes  that  pcre2_compile()  may
1134       return  if  it finds an error in the pattern. There are also some nega‐
1135       tive error codes that are used for invalid UTF strings. These  are  the
1136       same as given by pcre2_match() and pcre2_dfa_match(), and are described
1137       in the pcre2unicode page. There is no separate  documentation  for  the
1138       positive  error  codes,  because  the  textual  error messages that are
1139       obtained  by  calling  the  pcre2_get_error_message()   function   (see
1140       "Obtaining  a textual error message" below) should be self-explanatory.
1141       Macro names starting with PCRE2_ERROR_ are defined  for  both  positive
1142       and negative error codes in pcre2.h.
1143
1144       The value returned in erroroffset is an indication of where in the pat‐
1145       tern the error occurred. It is not necessarily the  furthest  point  in
1146       the  pattern  that  was  read. For example, after the error "lookbehind
1147       assertion is not fixed length", the error offset points to the start of
1148       the  failing assertion. For an invalid UTF-8 or UTF-16 string, the off‐
1149       set is that of the first code unit of the failing character.
1150
1151       Some errors are not detected until the whole pattern has been  scanned;
1152       in  these  cases,  the offset passed back is the length of the pattern.
1153       Note that the offset is in code units, not characters, even  in  a  UTF
1154       mode. It may sometimes point into the middle of a UTF-8 or UTF-16 char‐
1155       acter.
1156
1157       This code fragment shows a typical straightforward call  to  pcre2_com‐
1158       pile():
1159
1160         pcre2_code *re;
1161         PCRE2_SIZE erroffset;
1162         int errorcode;
1163         re = pcre2_compile(
1164           "^A.*Z",                /* the pattern */
1165           PCRE2_ZERO_TERMINATED,  /* the pattern is zero-terminated */
1166           0,                      /* default options */
1167           &errorcode,             /* for error code */
1168           &erroffset,             /* for error offset */
1169           NULL);                  /* no compile context */
1170
1171
1172   Main compile options
1173
1174       The  following  names for option bits are defined in the pcre2.h header
1175       file:
1176
1177         PCRE2_ANCHORED
1178
1179       If this bit is set, the pattern is forced to be "anchored", that is, it
1180       is  constrained to match only at the first matching point in the string
1181       that is being searched (the "subject string"). This effect can also  be
1182       achieved  by appropriate constructs in the pattern itself, which is the
1183       only way to do it in Perl.
1184
1185         PCRE2_ALLOW_EMPTY_CLASS
1186
1187       By default, for compatibility with Perl, a closing square bracket  that
1188       immediately  follows  an opening one is treated as a data character for
1189       the class. When  PCRE2_ALLOW_EMPTY_CLASS  is  set,  it  terminates  the
1190       class, which therefore contains no characters and so can never match.
1191
1192         PCRE2_ALT_BSUX
1193
1194       This  option  requests  alternative handling of three escape sequences,
1195       which makes PCRE2's behaviour more like  ECMAscript  (aka  JavaScript).
1196       When it is set:
1197
1198       (1) \U matches an upper case "U" character; by default \U causes a com‐
1199       pile time error (Perl uses \U to upper case subsequent characters).
1200
1201       (2) \u matches a lower case "u" character unless it is followed by four
1202       hexadecimal  digits,  in  which case the hexadecimal number defines the
1203       code point to match. By default, \u causes a compile time  error  (Perl
1204       uses it to upper case the following character).
1205
1206       (3)  \x matches a lower case "x" character unless it is followed by two
1207       hexadecimal digits, in which case the hexadecimal  number  defines  the
1208       code  point  to  match. By default, as in Perl, a hexadecimal number is
1209       always expected after \x, but it may have zero, one, or two digits (so,
1210       for example, \xz matches a binary zero character followed by z).
1211
1212       ECMAscript 6 added additional functionality to \u. This can be accessed
1213       using  the  PCRE2_EXTRA_ALT_BSUX  extra  option  (see  "Extra   compile
1214       options"  below).   Note  that this alternative escape handling applies
1215       only to patterns. Neither of these options affects  the  processing  of
1216       replacement strings passed to pcre2_substitute().
1217
1218         PCRE2_ALT_CIRCUMFLEX
1219
1220       In  multiline  mode  (when  PCRE2_MULTILINE  is  set),  the  circumflex
1221       metacharacter matches at the start of the subject (unless  PCRE2_NOTBOL
1222       is  set),  and  also  after  any internal newline. However, it does not
1223       match after a newline at the end of the subject, for compatibility with
1224       Perl.  If  you want a multiline circumflex also to match after a termi‐
1225       nating newline, you must set PCRE2_ALT_CIRCUMFLEX.
1226
1227         PCRE2_ALT_VERBNAMES
1228
1229       By default, for compatibility with Perl, the name in any verb  sequence
1230       such  as  (*MARK:NAME)  is  any  sequence  of  characters that does not
1231       include a closing parenthesis. The name is not processed  in  any  way,
1232       and  it  is  not possible to include a closing parenthesis in the name.
1233       However, if the PCRE2_ALT_VERBNAMES option  is  set,  normal  backslash
1234       processing  is  applied  to  verb  names  and only an unescaped closing
1235       parenthesis terminates the name. A closing parenthesis can be  included
1236       in  a  name either as \) or between \Q and \E. If the PCRE2_EXTENDED or
1237       PCRE2_EXTENDED_MORE option is set with  PCRE2_ALT_VERBNAMES,  unescaped
1238       whitespace  in  verb  names  is  skipped and #-comments are recognized,
1239       exactly as in the rest of the pattern.
1240
1241         PCRE2_AUTO_CALLOUT
1242
1243       If this bit  is  set,  pcre2_compile()  automatically  inserts  callout
1244       items,  all  with  number 255, before each pattern item, except immedi‐
1245       ately before or after an explicit callout in the pattern.  For  discus‐
1246       sion of the callout facility, see the pcre2callout documentation.
1247
1248         PCRE2_CASELESS
1249
1250       If  this  bit is set, letters in the pattern match both upper and lower
1251       case letters in the subject. It is equivalent to Perl's /i option,  and
1252       it  can  be  changed  within  a  pattern  by  a (?i) option setting. If
1253       PCRE2_UTF is set, Unicode properties are used for all  characters  with
1254       more  than one other case, and for all characters whose code points are
1255       greater than U+007F. For lower valued characters with  only  one  other
1256       case,  a  lookup  table is used for speed. When PCRE2_UTF is not set, a
1257       lookup table is used for all code points less than 256, and higher code
1258       points  (available  only  in  16-bit or 32-bit mode) are treated as not
1259       having another case.
1260
1261         PCRE2_DOLLAR_ENDONLY
1262
1263       If this bit is set, a dollar metacharacter in the pattern matches  only
1264       at  the  end  of the subject string. Without this option, a dollar also
1265       matches immediately before a newline at the end of the string (but  not
1266       before  any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored
1267       if PCRE2_MULTILINE is set. There is no equivalent  to  this  option  in
1268       Perl, and no way to set it within a pattern.
1269
1270         PCRE2_DOTALL
1271
1272       If  this  bit  is  set,  a dot metacharacter in the pattern matches any
1273       character, including one that indicates a  newline.  However,  it  only
1274       ever matches one character, even if newlines are coded as CRLF. Without
1275       this option, a dot does not match when the current position in the sub‐
1276       ject  is  at  a newline. This option is equivalent to Perl's /s option,
1277       and it can be changed within a pattern by a (?s) option setting. A neg‐
1278       ative  class such as [^a] always matches newline characters, and the \N
1279       escape sequence always matches a non-newline character, independent  of
1280       the setting of PCRE2_DOTALL.
1281
1282         PCRE2_DUPNAMES
1283
1284       If  this  bit is set, names used to identify capture groups need not be
1285       unique.  This can be helpful for certain types of pattern  when  it  is
1286       known  that  only  one instance of the named group can ever be matched.
1287       There are more details of named capture  groups  below;  see  also  the
1288       pcre2pattern documentation.
1289
1290         PCRE2_ENDANCHORED
1291
1292       If  this  bit is set, the end of any pattern match must be right at the
1293       end of the string being searched (the "subject string"). If the pattern
1294       match succeeds by reaching (*ACCEPT), but does not reach the end of the
1295       subject, the match fails at the current starting point. For  unanchored
1296       patterns,  a  new  match is then tried at the next starting point. How‐
1297       ever, if the match succeeds by reaching the end of the pattern, but not
1298       the  end  of  the subject, backtracking occurs and an alternative match
1299       may be found. Consider these two patterns:
1300
1301         .(*ACCEPT)|..
1302         .|..
1303
1304       If matched against "abc" with PCRE2_ENDANCHORED set, the first  matches
1305       "c"  whereas  the  second matches "bc". The effect of PCRE2_ENDANCHORED
1306       can also be achieved by appropriate constructs in the  pattern  itself,
1307       which is the only way to do it in Perl.
1308
1309       For DFA matching with pcre2_dfa_match(), PCRE2_ENDANCHORED applies only
1310       to the first (that is, the  longest)  matched  string.  Other  parallel
1311       matches,  which are necessarily substrings of the first one, must obvi‐
1312       ously end before the end of the subject.
1313
1314         PCRE2_EXTENDED
1315
1316       If this bit is set, most white space  characters  in  the  pattern  are
1317       totally  ignored  except when escaped or inside a character class. How‐
1318       ever, white space is not allowed within  sequences  such  as  (?>  that
1319       introduce  various  parenthesized  groups, nor within numerical quanti‐
1320       fiers such as {1,3}. Ignorable white space is permitted between an item
1321       and  a  following quantifier and between a quantifier and a following +
1322       that indicates possessiveness. PCRE2_EXTENDED is equivalent  to  Perl's
1323       /x option, and it can be changed within a pattern by a (?x) option set‐
1324       ting.
1325
1326       When PCRE2 is compiled without Unicode support,  PCRE2_EXTENDED  recog‐
1327       nizes  as  white space only those characters with code points less than
1328       256 that are flagged as white space in its low-character table. The ta‐
1329       ble is normally created by pcre2_maketables(), which uses the isspace()
1330       function to identify space characters. In most ASCII environments,  the
1331       relevant  characters  are  those  with code points 0x0009 (tab), 0x000A
1332       (linefeed), 0x000B (vertical tab), 0x000C (formfeed), 0x000D  (carriage
1333       return), and 0x0020 (space).
1334
1335       When PCRE2 is compiled with Unicode support, in addition to these char‐
1336       acters, five more Unicode "Pattern White Space" characters  are  recog‐
1337       nized by PCRE2_EXTENDED. These are U+0085 (next line), U+200E (left-to-
1338       right mark), U+200F (right-to-left mark), U+2028 (line separator),  and
1339       U+2029  (paragraph  separator).  This  set of characters is the same as
1340       recognized by Perl's /x option. Note that the horizontal  and  vertical
1341       space  characters that are matched by the \h and \v escapes in patterns
1342       are a much bigger set.
1343
1344       As well as ignoring most white space, PCRE2_EXTENDED also causes  char‐
1345       acters  between  an  unescaped # outside a character class and the next
1346       newline, inclusive, to be ignored, which makes it possible  to  include
1347       comments inside complicated patterns. Note that the end of this type of
1348       comment is a literal newline sequence in the pattern; escape  sequences
1349       that happen to represent a newline do not count.
1350
1351       Which characters are interpreted as newlines can be specified by a set‐
1352       ting in the compile context that is passed to pcre2_compile() or  by  a
1353       special  sequence at the start of the pattern, as described in the sec‐
1354       tion entitled "Newline conventions" in the pcre2pattern  documentation.
1355       A default is defined when PCRE2 is built.
1356
1357         PCRE2_EXTENDED_MORE
1358
1359       This  option  has  the  effect  of  PCRE2_EXTENDED,  but,  in addition,
1360       unescaped space and horizontal tab  characters  are  ignored  inside  a
1361       character  class.  Note: only these two characters are ignored, not the
1362       full set of pattern white space characters that are ignored  outside  a
1363       character  class.  PCRE2_EXTENDED_MORE  is  equivalent  to  Perl's  /xx
1364       option, and it can be changed within a pattern by a (?xx)  option  set‐
1365       ting.
1366
1367         PCRE2_FIRSTLINE
1368
1369       If this option is set, the start of an unanchored pattern match must be
1370       before or at the first newline in  the  subject  string  following  the
1371       start  of  matching, though the matched text may continue over the new‐
1372       line. If startoffset is non-zero, the limiting newline is not necessar‐
1373       ily  the  first  newline  in  the  subject. For example, if the subject
1374       string is "abc\nxyz" (where \n represents a single-character newline) a
1375       pattern  match for "yz" succeeds with PCRE2_FIRSTLINE if startoffset is
1376       greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a  more
1377       general  limiting  facility.  If  PCRE2_FIRSTLINE is set with an offset
1378       limit, a match must occur in the first line and also within the  offset
1379       limit. In other words, whichever limit comes first is used.
1380
1381         PCRE2_LITERAL
1382
1383       If this option is set, all meta-characters in the pattern are disabled,
1384       and it is treated as a literal string. Matching literal strings with  a
1385       regular expression engine is not the most efficient way of doing it. If
1386       you are doing a lot of literal matching and  are  worried  about  effi‐
1387       ciency, you should consider using other approaches. The only other main
1388       options  that  are  allowed  with  PCRE2_LITERAL  are:  PCRE2_ANCHORED,
1389       PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, PCRE2_CASELESS, PCRE2_FIRSTLINE,
1390       PCRE2_NO_START_OPTIMIZE,     PCRE2_NO_UTF_CHECK,     PCRE2_UTF,     and
1391       PCRE2_USE_OFFSET_LIMIT.  The  extra  options PCRE2_EXTRA_MATCH_LINE and
1392       PCRE2_EXTRA_MATCH_WORD are also supported. Any other options  cause  an
1393       error.
1394
1395         PCRE2_MATCH_UNSET_BACKREF
1396
1397       If  this  option  is  set,  a  backreference  to an unset capture group
1398       matches an empty string (by default this causes  the  current  matching
1399       alternative  to  fail).   A  pattern such as (\1)(a) succeeds when this
1400       option is set (assuming it can find an "a" in the subject), whereas  it
1401       fails  by  default,  for  Perl compatibility. Setting this option makes
1402       PCRE2 behave more like ECMAscript (aka JavaScript).
1403
1404         PCRE2_MULTILINE
1405
1406       By default, for the purposes of matching "start of line"  and  "end  of
1407       line",  PCRE2  treats the subject string as consisting of a single line
1408       of characters, even if it actually contains  newlines.  The  "start  of
1409       line"  metacharacter  (^)  matches only at the start of the string, and
1410       the "end of line" metacharacter ($) matches only  at  the  end  of  the
1411       string,  or  before  a  terminating  newline  (except  when  PCRE2_DOL‐
1412       LAR_ENDONLY is set). Note, however, that unless  PCRE2_DOTALL  is  set,
1413       the "any character" metacharacter (.) does not match at a newline. This
1414       behaviour (for ^, $, and dot) is the same as Perl.
1415
1416       When  PCRE2_MULTILINE  is  set, the "start of line" and "end  of  line"
1417       constructs  match  immediately following or immediately before internal
1418       newlines in the subject string, respectively, as well as  at  the  very
1419       start  and  end.  This is equivalent to Perl's /m option, and it can be
1420       changed within a pattern by a (?m) option setting. Note that the "start
1421       of line" metacharacter does not match after a newline at the end of the
1422       subject, for compatibility with Perl.  However, you can change this  by
1423       setting  the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a
1424       subject string, or no occurrences of ^  or  $  in  a  pattern,  setting
1425       PCRE2_MULTILINE has no effect.
1426
1427         PCRE2_NEVER_BACKSLASH_C
1428
1429       This  option  locks out the use of \C in the pattern that is being com‐
1430       piled.  This escape can  cause  unpredictable  behaviour  in  UTF-8  or
1431       UTF-16  modes,  because  it may leave the current matching point in the
1432       middle of a multi-code-unit character. This option  may  be  useful  in
1433       applications  that  process  patterns  from external sources. Note that
1434       there is also a build-time option that permanently locks out the use of
1435       \C.
1436
1437         PCRE2_NEVER_UCP
1438
1439       This  option  locks  out the use of Unicode properties for handling \B,
1440       \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as
1441       described  for  the  PCRE2_UCP option below. In particular, it prevents
1442       the creator of the pattern from enabling this facility by starting  the
1443       pattern  with  (*UCP).  This  option may be useful in applications that
1444       process   patterns  from  external  sources.   The  option  combination
1445       PCRE2_UCP and PCRE2_NEVER_UCP causes an error.
1446
1447         PCRE2_NEVER_UTF
1448
1449       This  option  locks out interpretation of the pattern as UTF-8, UTF-16,
1450       or UTF-32, depending on which library is in use. In particular, it pre‐
1451       vents  the  creator of the pattern from switching to UTF interpretation
1452       by starting the pattern with (*UTF).  This  option  may  be  useful  in
1453       applications  that process patterns from external sources. The combina‐
1454       tion of PCRE2_UTF and PCRE2_NEVER_UTF causes an error.
1455
1456         PCRE2_NO_AUTO_CAPTURE
1457
1458       If this option is set, it disables the use of numbered capturing paren‐
1459       theses  in the pattern. Any opening parenthesis that is not followed by
1460       ? behaves as if it were followed by ?: but named parentheses can  still
1461       be used for capturing (and they acquire numbers in the usual way). This
1462       is the same as Perl's /n option.  Note that, when this option  is  set,
1463       references  to  capture  groups (backreferences or recursion/subroutine
1464       calls) may only refer to named groups, though the reference can  be  by
1465       name or by number.
1466
1467         PCRE2_NO_AUTO_POSSESS
1468
1469       If this option is set, it disables "auto-possessification", which is an
1470       optimization that, for example, turns a+b into a++b in order  to  avoid
1471       backtracks  into  a+ that can never be successful. However, if callouts
1472       are in use, auto-possessification means that some  callouts  are  never
1473       taken. You can set this option if you want the matching functions to do
1474       a full unoptimized search and run all the callouts, but  it  is  mainly
1475       provided for testing purposes.
1476
1477         PCRE2_NO_DOTSTAR_ANCHOR
1478
1479       If this option is set, it disables an optimization that is applied when
1480       .* is the first significant item in a top-level branch  of  a  pattern,
1481       and  all  the  other branches also start with .* or with \A or \G or ^.
1482       The optimization is automatically disabled for .* if it  is  inside  an
1483       atomic group or a capture group that is the subject of a backreference,
1484       or if the pattern contains (*PRUNE) or (*SKIP). When  the  optimization
1485       is   not   disabled,  such  a  pattern  is  automatically  anchored  if
1486       PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set
1487       for  any  ^ items. Otherwise, the fact that any match must start either
1488       at the start of the subject or following a newline is remembered.  Like
1489       other optimizations, this can cause callouts to be skipped.
1490
1491         PCRE2_NO_START_OPTIMIZE
1492
1493       This  is  an  option whose main effect is at matching time. It does not
1494       change what pcre2_compile() generates, but it does affect the output of
1495       the JIT compiler.
1496
1497       There  are  a  number of optimizations that may occur at the start of a
1498       match, in order to speed up the process. For example, if  it  is  known
1499       that  an  unanchored  match must start with a specific code unit value,
1500       the matching code searches the subject for that value, and fails  imme‐
1501       diately  if it cannot find it, without actually running the main match‐
1502       ing function. This means that a special item such as (*COMMIT)  at  the
1503       start  of  a  pattern is not considered until after a suitable starting
1504       point for the match has been found.  Also,  when  callouts  or  (*MARK)
1505       items  are  in use, these "start-up" optimizations can cause them to be
1506       skipped if the pattern is never actually used. The  start-up  optimiza‐
1507       tions  are  in effect a pre-scan of the subject that takes place before
1508       the pattern is run.
1509
1510       The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
1511       possibly  causing  performance  to  suffer,  but ensuring that in cases
1512       where the result is "no match", the callouts do occur, and  that  items
1513       such as (*COMMIT) and (*MARK) are considered at every possible starting
1514       position in the subject string.
1515
1516       Setting PCRE2_NO_START_OPTIMIZE may change the outcome  of  a  matching
1517       operation.  Consider the pattern
1518
1519         (*COMMIT)ABC
1520
1521       When  this  is compiled, PCRE2 records the fact that a match must start
1522       with the character "A". Suppose the subject  string  is  "DEFABC".  The
1523       start-up  optimization  scans along the subject, finds "A" and runs the
1524       first match attempt from there. The (*COMMIT) item means that the  pat‐
1525       tern  must  match the current starting position, which in this case, it
1526       does. However, if the same match is  run  with  PCRE2_NO_START_OPTIMIZE
1527       set,  the  initial  scan  along the subject string does not happen. The
1528       first match attempt is run starting  from  "D"  and  when  this  fails,
1529       (*COMMIT)  prevents  any  further  matches  being tried, so the overall
1530       result is "no match".
1531
1532       There are also other start-up optimizations.  For  example,  a  minimum
1533       length for the subject may be recorded. Consider the pattern
1534
1535         (*MARK:A)(X|Y)
1536
1537       The  minimum  length  for  a  match is one character. If the subject is
1538       "ABC", there will be attempts to match "ABC", "BC", and "C". An attempt
1539       to match an empty string at the end of the subject does not take place,
1540       because PCRE2 knows that the subject is  now  too  short,  and  so  the
1541       (*MARK)  is  never encountered. In this case, the optimization does not
1542       affect the overall match result, which is still "no match", but it does
1543       affect the auxiliary information that is returned.
1544
1545         PCRE2_NO_UTF_CHECK
1546
1547       When  PCRE2_UTF  is set, the validity of the pattern as a UTF string is
1548       automatically checked. There are  discussions  about  the  validity  of
1549       UTF-8  strings,  UTF-16 strings, and UTF-32 strings in the pcre2unicode
1550       document. If an invalid UTF sequence is found, pcre2_compile()  returns
1551       a negative error code.
1552
1553       If  you  know  that your pattern is a valid UTF string, and you want to
1554       skip  this  check  for   performance   reasons,   you   can   set   the
1555       PCRE2_NO_UTF_CHECK  option.  When  it  is set, the effect of passing an
1556       invalid UTF string as a pattern is undefined. It may cause your program
1557       to crash or loop.
1558
1559       Note  that  this  option  can  also  be  passed  to  pcre2_match()  and
1560       pcre2_dfa_match(), to suppress UTF validity  checking  of  the  subject
1561       string.
1562
1563       Note also that setting PCRE2_NO_UTF_CHECK at compile time does not dis‐
1564       able the error that is given if an escape sequence for an invalid  Uni‐
1565       code  code  point is encountered in the pattern. In particular, the so-
1566       called "surrogate" code points (0xd800 to 0xdfff) are invalid.  If  you
1567       want  to  allow  escape  sequences  such  as  \x{d800}  you can set the
1568       PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option, as described  in  the
1569       section  entitled "Extra compile options" below.  However, this is pos‐
1570       sible only in UTF-8 and UTF-32 modes, because these values are not rep‐
1571       resentable in UTF-16.
1572
1573         PCRE2_UCP
1574
1575       This option changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W,
1576       \w, and some of the POSIX character classes.  By  default,  only  ASCII
1577       characters  are recognized, but if PCRE2_UCP is set, Unicode properties
1578       are used instead to classify characters. More details are given in  the
1579       section on generic character types in the pcre2pattern page. If you set
1580       PCRE2_UCP, matching one of the items it affects takes much longer.  The
1581       option  is  available only if PCRE2 has been compiled with Unicode sup‐
1582       port (which is the default).
1583
1584         PCRE2_UNGREEDY
1585
1586       This option inverts the "greediness" of the quantifiers  so  that  they
1587       are  not greedy by default, but become greedy if followed by "?". It is
1588       not compatible with Perl. It can also be set by a (?U)  option  setting
1589       within the pattern.
1590
1591         PCRE2_USE_OFFSET_LIMIT
1592
1593       This option must be set for pcre2_compile() if pcre2_set_offset_limit()
1594       is going to be used to set a non-default offset limit in a  match  con‐
1595       text  for  matches  that  use this pattern. An error is generated if an
1596       offset limit is set without this option.  For  more  details,  see  the
1597       description  of  pcre2_set_offset_limit() in the section that describes
1598       match contexts. See also the PCRE2_FIRSTLINE option above.
1599
1600         PCRE2_UTF
1601
1602       This option causes PCRE2 to regard both the  pattern  and  the  subject
1603       strings  that  are  subsequently processed as strings of UTF characters
1604       instead of single-code-unit strings. It  is  available  when  PCRE2  is
1605       built  to  include  Unicode  support (which is the default). If Unicode
1606       support is not available, the use of this  option  provokes  an  error.
1607       Details  of  how  PCRE2_UTF changes the behaviour of PCRE2 are given in
1608       the pcre2unicode page. In particular, note  that  it  changes  the  way
1609       PCRE2_CASELESS handles characters with code points greater than 127.
1610
1611   Extra compile options
1612
1613       The  option  bits  that  can be set in a compile context by calling the
1614       pcre2_set_compile_extra_options() function are as follows:
1615
1616         PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
1617
1618       This option applies when compiling a pattern in UTF-8 or  UTF-32  mode.
1619       It  is  forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode
1620       "surrogate" code points in the range 0xd800 to 0xdfff are used in pairs
1621       in  UTF-16  to  encode  code points with values in the range 0x10000 to
1622       0x10ffff. The surrogates cannot therefore  be  represented  in  UTF-16.
1623       They can be represented in UTF-8 and UTF-32, but are defined as invalid
1624       code points, and cause errors if  encountered  in  a  UTF-8  or  UTF-32
1625       string that is being checked for validity by PCRE2.
1626
1627       These  values also cause errors if encountered in escape sequences such
1628       as \x{d912} within a pattern. However, it seems that some applications,
1629       when  using  PCRE2  to  check for unwanted characters in UTF-8 strings,
1630       explicitly  test  for  the  surrogates  using  escape  sequences.   The
1631       PCRE2_NO_UTF_CHECK  option  does  not  disable  the  error that occurs,
1632       because it applies only to the testing of input strings for UTF  valid‐
1633       ity.
1634
1635       If  the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surro‐
1636       gate code point values in UTF-8 and UTF-32 patterns no  longer  provoke
1637       errors  and are incorporated in the compiled pattern. However, they can
1638       only match subject characters if the matching function is  called  with
1639       PCRE2_NO_UTF_CHECK set.
1640
1641         PCRE2_EXTRA_ALT_BSUX
1642
1643       The  original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and
1644       \x in the way that ECMAscript (aka JavaScript) does.  Additional  func‐
1645       tionality was defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has
1646       the effect of PCRE2_ALT_BSUX, but in addition it  recognizes  \u{hhh..}
1647       as a hexadecimal character code, where hhh.. is any number of hexadeci‐
1648       mal digits.
1649
1650         PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL
1651
1652       This is a dangerous option. Use with care. By default, an  unrecognized
1653       escape  such  as \j or a malformed one such as \x{2z} causes a compile-
1654       time error when detected by pcre2_compile(). Perl is somewhat inconsis‐
1655       tent  in  handling  such items: for example, \j is treated as a literal
1656       "j", and non-hexadecimal digits in \x{} are just ignored, though  warn‐
1657       ings  are given in both cases if Perl's warning switch is enabled. How‐
1658       ever, a malformed octal number after \o{  always  causes  an  error  in
1659       Perl.
1660
1661       If  the  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL  extra  option  is passed to
1662       pcre2_compile(), all unrecognized or  malformed  escape  sequences  are
1663       treated  as  single-character escapes. For example, \j is a literal "j"
1664       and \x{2z} is treated as  the  literal  string  "x{2z}".  Setting  this
1665       option  means  that  typos in patterns may go undetected and have unex‐
1666       pected results. Also note that a sequence such as [\N{] is  interpreted
1667       as  a  malformed attempt at [\N{...}] and so is treated as [N{] whereas
1668       [\N] gives an error  because  an  unqualified  \N  is  a  valid  escape
1669       sequence  but is not supported in a character class. To reiterate: this
1670       is a dangerous option. Use with great care.
1671
1672         PCRE2_EXTRA_ESCAPED_CR_IS_LF
1673
1674       There are some legacy applications where the escape sequence  \r  in  a
1675       pattern  is expected to match a newline. If this option is set, \r in a
1676       pattern is converted to \n so that it matches a LF  (linefeed)  instead
1677       of  a CR (carriage return) character. The option does not affect a lit‐
1678       eral CR in the pattern, nor does it affect CR specified as an  explicit
1679       code point such as \x{0D}.
1680
1681         PCRE2_EXTRA_MATCH_LINE
1682
1683       This  option  is  provided  for  use  by the -x option of pcre2grep. It
1684       causes the pattern only to match complete lines. This  is  achieved  by
1685       automatically  inserting  the  code for "^(?:" at the start of the com‐
1686       piled pattern and ")$" at the end. Thus, when PCRE2_MULTILINE  is  set,
1687       the  matched  line  may  be  in  the middle of the subject string. This
1688       option can be used with PCRE2_LITERAL.
1689
1690         PCRE2_EXTRA_MATCH_WORD
1691
1692       This option is provided for use by  the  -w  option  of  pcre2grep.  It
1693       causes  the  pattern only to match strings that have a word boundary at
1694       the start and the end. This is achieved by automatically inserting  the
1695       code  for "\b(?:" at the start of the compiled pattern and ")\b" at the
1696       end. The option may be used with PCRE2_LITERAL. However, it is  ignored
1697       if PCRE2_EXTRA_MATCH_LINE is also set.
1698

JUST-IN-TIME (JIT) COMPILATION

1700
1701       int pcre2_jit_compile(pcre2_code *code, uint32_t options);
1702
1703       int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
1704         PCRE2_SIZE length, PCRE2_SIZE startoffset,
1705         uint32_t options, pcre2_match_data *match_data,
1706         pcre2_match_context *mcontext);
1707
1708       void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
1709
1710       pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE startsize,
1711         PCRE2_SIZE maxsize, pcre2_general_context *gcontext);
1712
1713       void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
1714         pcre2_jit_callback callback_function, void *callback_data);
1715
1716       void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
1717
1718       These  functions  provide  support  for  JIT compilation, which, if the
1719       just-in-time compiler is available, further processes a  compiled  pat‐
1720       tern into machine code that executes much faster than the pcre2_match()
1721       interpretive matching function. Full details are given in the  pcre2jit
1722       documentation.
1723
1724       JIT  compilation  is  a heavyweight optimization. It can take some time
1725       for patterns to be analyzed, and for one-off matches  and  simple  pat‐
1726       terns  the benefit of faster execution might be offset by a much slower
1727       compilation time.  Most (but not all) patterns can be optimized by  the
1728       JIT compiler.
1729

LOCALE SUPPORT

1731
1732       PCRE2  handles caseless matching, and determines whether characters are
1733       letters, digits, or whatever, by reference to a set of tables,  indexed
1734       by  character  code  point.  This applies only to characters whose code
1735       points are less than 256. By default, higher-valued code  points  never
1736       match  escapes  such as \w or \d.  However, if PCRE2 is built with Uni‐
1737       code support, all characters can be tested with \p and \P, or, alterna‐
1738       tively,  the  PCRE2_UCP  option  can be set when a pattern is compiled;
1739       this causes \w and friends to use Unicode property support  instead  of
1740       the built-in tables.
1741
1742       The  use  of  locales  with Unicode is discouraged. If you are handling
1743       characters with code points greater than 127,  you  should  either  use
1744       Unicode support, or use locales, but not try to mix the two.
1745
1746       PCRE2  contains  an  internal  set of character tables that are used by
1747       default.  These are sufficient for  many  applications.  Normally,  the
1748       internal tables recognize only ASCII characters. However, when PCRE2 is
1749       built, it is possible to cause the internal tables to be rebuilt in the
1750       default "C" locale of the local system, which may cause them to be dif‐
1751       ferent.
1752
1753       The internal tables can be overridden by tables supplied by the  appli‐
1754       cation  that  calls  PCRE2.  These may be created in a different locale
1755       from the default.  As more and more applications change to  using  Uni‐
1756       code, the need for this locale support is expected to die away.
1757
1758       External  tables  are built by calling the pcre2_maketables() function,
1759       in the relevant locale. The result can be passed to pcre2_compile()  as
1760       often   as  necessary,  by  creating  a  compile  context  and  calling
1761       pcre2_set_character_tables() to set the  tables  pointer  therein.  For
1762       example,  to  build  and use tables that are appropriate for the French
1763       locale (where accented characters with  values  greater  than  127  are
1764       treated as letters), the following code could be used:
1765
1766         setlocale(LC_CTYPE, "fr_FR");
1767         tables = pcre2_maketables(NULL);
1768         ccontext = pcre2_compile_context_create(NULL);
1769         pcre2_set_character_tables(ccontext, tables);
1770         re = pcre2_compile(..., ccontext);
1771
1772       The  locale  name "fr_FR" is used on Linux and other Unix-like systems;
1773       if you are using Windows, the name for the French locale  is  "french".
1774       It  is the caller's responsibility to ensure that the memory containing
1775       the tables remains available for as long as it is needed.
1776
1777       The pointer that is passed (via the compile context) to pcre2_compile()
1778       is  saved  with  the  compiled pattern, and the same tables are used by
1779       pcre2_match() and pcre2_dfa_match(). Thus, for any single pattern, com‐
1780       pilation  and  matching  both  happen in the same locale, but different
1781       patterns can be processed in different locales.
1782

INFORMATION ABOUT A COMPILED PATTERN

1784
1785       int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);
1786
1787       The pcre2_pattern_info() function returns general information  about  a
1788       compiled pattern. For information about callouts, see the next section.
1789       The first argument for pcre2_pattern_info() is a pointer  to  the  com‐
1790       piled pattern. The second argument specifies which piece of information
1791       is required, and the third argument is  a  pointer  to  a  variable  to
1792       receive  the data. If the third argument is NULL, the first argument is
1793       ignored, and the function returns the size in  bytes  of  the  variable
1794       that is required for the information requested. Otherwise, the yield of
1795       the function is zero for success, or one of the following negative num‐
1796       bers:
1797
1798         PCRE2_ERROR_NULL           the argument code was NULL
1799         PCRE2_ERROR_BADMAGIC       the "magic number" was not found
1800         PCRE2_ERROR_BADOPTION      the value of what was invalid
1801         PCRE2_ERROR_UNSET          the requested field is not set
1802
1803       The  "magic  number" is placed at the start of each compiled pattern as
1804       a simple check against passing an arbitrary memory pointer.  Here is  a
1805       typical  call of pcre2_pattern_info(), to obtain the length of the com‐
1806       piled pattern:
1807
1808         int rc;
1809         size_t length;
1810         rc = pcre2_pattern_info(
1811           re,               /* result of pcre2_compile() */
1812           PCRE2_INFO_SIZE,  /* what is required */
1813           &length);         /* where to put the data */
1814
1815       The possible values for the second argument are defined in pcre2.h, and
1816       are as follows:
1817
1818         PCRE2_INFO_ALLOPTIONS
1819         PCRE2_INFO_ARGOPTIONS
1820         PCRE2_INFO_EXTRAOPTIONS
1821
1822       Return copies of the pattern's options. The third argument should point
1823       to a  uint32_t  variable.  PCRE2_INFO_ARGOPTIONS  returns  exactly  the
1824       options  that were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOP‐
1825       TIONS returns the compile options as modified by any  top-level  (*XXX)
1826       option  settings  such  as  (*UTF)  at the start of the pattern itself.
1827       PCRE2_INFO_EXTRAOPTIONS returns the extra options that were set in  the
1828       compile  context by calling the pcre2_set_compile_extra_options() func‐
1829       tion.
1830
1831       For  example,  if  the  pattern  /(*UTF)abc/  is  compiled   with   the
1832       PCRE2_EXTENDED   option,   the   result  for  PCRE2_INFO_ALLOPTIONS  is
1833       PCRE2_EXTENDED and PCRE2_UTF.  Option settings such as  (?i)  that  can
1834       change  within  a pattern do not affect the result of PCRE2_INFO_ALLOP‐
1835       TIONS, even if they appear right at the start of the pattern. (This was
1836       different in some earlier releases.)
1837
1838       A  pattern compiled without PCRE2_ANCHORED is automatically anchored by
1839       PCRE2 if the first significant item in every top-level branch is one of
1840       the following:
1841
1842         ^     unless PCRE2_MULTILINE is set
1843         \A    always
1844         \G    always
1845         .*    sometimes - see below
1846
1847       When  .* is the first significant item, anchoring is possible only when
1848       all the following are true:
1849
1850         .* is not in an atomic group
1851         .* is not in a capture group that is the subject
1852              of a backreference
1853         PCRE2_DOTALL is in force for .*
1854         Neither (*PRUNE) nor (*SKIP) appears in the pattern
1855         PCRE2_NO_DOTSTAR_ANCHOR is not set
1856
1857       For patterns that are auto-anchored, the PCRE2_ANCHORED bit is  set  in
1858       the options returned for PCRE2_INFO_ALLOPTIONS.
1859
1860         PCRE2_INFO_BACKREFMAX
1861
1862       Return  the  number  of  the  highest backreference in the pattern. The
1863       third argument should point to  an  uint32_t  variable.  Named  capture
1864       groups  acquire  numbers  as well as names, and these count towards the
1865       highest backreference. Backreferences such as \4 or  \g{12}  match  the
1866       captured characters of the given group, but in addition, the check that
1867       a capture group is set in a conditional group such as (?(3)a|b) is also
1868       a backreference.  Zero is returned if there are no backreferences.
1869
1870         PCRE2_INFO_BSR
1871
1872       The  output  is a uint32_t integer whose value indicates what character
1873       sequences the \R escape sequence matches. A value of  PCRE2_BSR_UNICODE
1874       means  that  \R  matches  any  Unicode line ending sequence; a value of
1875       PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF.
1876
1877         PCRE2_INFO_CAPTURECOUNT
1878
1879       Return the highest capture group number in  the  pattern.  In  patterns
1880       where (?| is not used, this is also the total number of capture groups.
1881       The third argument should point to an uint32_t variable.
1882
1883         PCRE2_INFO_DEPTHLIMIT
1884
1885       If the pattern set a backtracking depth limit by including an  item  of
1886       the  form  (*LIMIT_DEPTH=nnnn) at the start, the value is returned. The
1887       third argument should point to a uint32_t integer. If no such value has
1888       been   set,   the   call  to  pcre2_pattern_info()  returns  the  error
1889       PCRE2_ERROR_UNSET. Note that this limit will only be used during match‐
1890       ing  if it is less than the limit set or defaulted by the caller of the
1891       match function.
1892
1893         PCRE2_INFO_FIRSTBITMAP
1894
1895       In the absence of a single first code unit for a non-anchored  pattern,
1896       pcre2_compile()  may construct a 256-bit table that defines a fixed set
1897       of values for the first code unit in any match. For example, a  pattern
1898       that  starts  with  [abc]  results in a table with three bits set. When
1899       code unit values greater than 255 are supported, the flag bit  for  255
1900       means  "any  code unit of value 255 or above". If such a table was con‐
1901       structed, a pointer to it is returned. Otherwise NULL is returned.  The
1902       third argument should point to a const uint8_t * variable.
1903
1904         PCRE2_INFO_FIRSTCODETYPE
1905
1906       Return information about the first code unit of any matched string, for
1907       a non-anchored pattern. The third argument should point to an  uint32_t
1908       variable.  If there is a fixed first value, for example, the letter "c"
1909       from a pattern such as (cat|cow|coyote), 1 is returned, and  the  value
1910       can  be  retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed
1911       first value, but it is known that a match can occur only at  the  start
1912       of  the  subject  or following a newline in the subject, 2 is returned.
1913       Otherwise, and for anchored patterns, 0 is returned.
1914
1915         PCRE2_INFO_FIRSTCODEUNIT
1916
1917       Return the value of the first code unit of any  matched  string  for  a
1918       pattern  where  PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0.
1919       The third argument should point to an uint32_t variable. In  the  8-bit
1920       library,  the  value is always less than 256. In the 16-bit library the
1921       value can be up to 0xffff. In the 32-bit library  in  UTF-32  mode  the
1922       value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32
1923       mode.
1924
1925         PCRE2_INFO_FRAMESIZE
1926
1927       Return the size (in bytes) of the data frames that are used to remember
1928       backtracking  positions  when the pattern is processed by pcre2_match()
1929       without the use of JIT. The third argument should  point  to  a  size_t
1930       variable. The frame size depends on the number of capturing parentheses
1931       in the pattern. Each additional capture group adds two PCRE2_SIZE vari‐
1932       ables.
1933
1934         PCRE2_INFO_HASBACKSLASHC
1935
1936       Return  1 if the pattern contains any instances of \C, otherwise 0. The
1937       third argument should point to an uint32_t variable.
1938
1939         PCRE2_INFO_HASCRORLF
1940
1941       Return 1 if the pattern contains any explicit  matches  for  CR  or  LF
1942       characters, otherwise 0. The third argument should point to an uint32_t
1943       variable. An explicit match is either a literal CR or LF character,  or
1944       \r  or  \n  or  one  of  the  equivalent  hexadecimal  or  octal escape
1945       sequences.
1946
1947         PCRE2_INFO_HEAPLIMIT
1948
1949       If the pattern set a heap memory limit by including an item of the form
1950       (*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argu‐
1951       ment should point to a uint32_t integer. If no such value has been set,
1952       the  call  to pcre2_pattern_info() returns the error PCRE2_ERROR_UNSET.
1953       Note that this limit will only be used during matching if  it  is  less
1954       than the limit set or defaulted by the caller of the match function.
1955
1956         PCRE2_INFO_JCHANGED
1957
1958       Return  1  if  the (?J) or (?-J) option setting is used in the pattern,
1959       otherwise 0. The third argument should point to an  uint32_t  variable.
1960       (?J)  and  (?-J) set and unset the local PCRE2_DUPNAMES option, respec‐
1961       tively.
1962
1963         PCRE2_INFO_JITSIZE
1964
1965       If the compiled pattern was successfully  processed  by  pcre2_jit_com‐
1966       pile(),  return  the  size  of  the JIT compiled code, otherwise return
1967       zero. The third argument should point to a size_t variable.
1968
1969         PCRE2_INFO_LASTCODETYPE
1970
1971       Return 1 if there is a rightmost literal code unit that must  exist  in
1972       any  matched string, other than at its start. The third argument should
1973       point to an uint32_t  variable.  If  there  is  no  such  value,  0  is
1974       returned.  When  1  is  returned,  the  code  unit  value itself can be
1975       retrieved using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a  last
1976       literal  value  is  recorded  only  if it follows something of variable
1977       length. For example, for the pattern /^a\d+z\d+/ the returned value  is
1978       1  (with  "z" returned from PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/
1979       the returned value is 0.
1980
1981         PCRE2_INFO_LASTCODEUNIT
1982
1983       Return the value of the rightmost literal code unit that must exist  in
1984       any  matched  string,  other  than  at  its  start, for a pattern where
1985       PCRE2_INFO_LASTCODETYPE returns 1. Otherwise, return 0. The third argu‐
1986       ment should point to an uint32_t variable.
1987
1988         PCRE2_INFO_MATCHEMPTY
1989
1990       Return  1  if the pattern might match an empty string, otherwise 0. The
1991       third argument should point to an uint32_t  variable.  When  a  pattern
1992       contains recursive subroutine calls it is not always possible to deter‐
1993       mine whether or not it can match an empty string. PCRE2  takes  a  cau‐
1994       tious approach and returns 1 in such cases.
1995
1996         PCRE2_INFO_MATCHLIMIT
1997
1998       If  the  pattern  set  a  match  limit by including an item of the form
1999       (*LIMIT_MATCH=nnnn) at the start, the  value  is  returned.  The  third
2000       argument  should point to a uint32_t integer. If no such value has been
2001       set,   the   call   to   pcre2_pattern_info()   returns    the    error
2002       PCRE2_ERROR_UNSET. Note that this limit will only be used during match‐
2003       ing if it is less than the limit set or defaulted by the caller of  the
2004       match function.
2005
2006         PCRE2_INFO_MAXLOOKBEHIND
2007
2008       Return the number of characters (not code units) in the longest lookbe‐
2009       hind assertion in the pattern. The third argument  should  point  to  a
2010       uint32_t  integer.  This information is useful when doing multi-segment
2011       matching using the partial matching facilities. Note  that  the  simple
2012       assertions \b and \B require a one-character lookbehind. \A also regis‐
2013       ters a one-character lookbehind, though it does  not  actually  inspect
2014       the  previous  character. This is to ensure that at least one character
2015       from the old segment is retained when a new segment is processed.  Oth‐
2016       erwise,  if  there  are  no  lookbehinds in the pattern, \A might match
2017       incorrectly at the start of a second or subsequent segment.
2018
2019         PCRE2_INFO_MINLENGTH
2020
2021       If a minimum length for matching  subject  strings  was  computed,  its
2022       value  is  returned.  Otherwise the returned value is 0. The value is a
2023       number of characters, which in UTF mode may be different from the  num‐
2024       ber  of  code  units.   The  third argument should point to an uint32_t
2025       variable. The value is a lower bound to  the  length  of  any  matching
2026       string.  There  may  not be any strings of that length that do actually
2027       match, but every string that does match is at least that long.
2028
2029         PCRE2_INFO_NAMECOUNT
2030         PCRE2_INFO_NAMEENTRYSIZE
2031         PCRE2_INFO_NAMETABLE
2032
2033       PCRE2 supports the use of named as well as numbered capturing parenthe‐
2034       ses.  The names are just an additional way of identifying the parenthe‐
2035       ses, which still acquire numbers. Several convenience functions such as
2036       pcre2_substring_get_byname()  are provided for extracting captured sub‐
2037       strings by name. It is also possible to extract the data  directly,  by
2038       first  converting  the  name to a number in order to access the correct
2039       pointers in the output vector (described with pcre2_match() below).  To
2040       do  the  conversion,  you  need to use the name-to-number map, which is
2041       described by these three values.
2042
2043       The map consists of a number of  fixed-size  entries.  PCRE2_INFO_NAME‐
2044       COUNT  gives  the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives
2045       the size of each entry in code units; both of these return  a  uint32_t
2046       value. The entry size depends on the length of the longest name.
2047
2048       PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table.
2049       This is a PCRE2_SPTR pointer to a block of code  units.  In  the  8-bit
2050       library,  the  first two bytes of each entry are the number of the cap‐
2051       turing parenthesis, most significant byte first. In the 16-bit library,
2052       the  pointer  points  to 16-bit code units, the first of which contains
2053       the parenthesis number. In the 32-bit library, the  pointer  points  to
2054       32-bit  code units, the first of which contains the parenthesis number.
2055       The rest of the entry is the corresponding name, zero terminated.
2056
2057       The names are in alphabetical order. If (?| is used to create  multiple
2058       capture  groups  with  the  same number, as described in the section on
2059       duplicate group numbers in the pcre2pattern page,  the  groups  may  be
2060       given  the same name, but there is only one entry in the table. Differ‐
2061       ent names for groups of the same number are not permitted.
2062
2063       Duplicate names for capture groups with different numbers  are  permit‐
2064       ted, but only if PCRE2_DUPNAMES is set. They appear in the table in the
2065       order in which they were found in the pattern. In the  absence  of  (?|
2066       this  is  the  order of increasing number; when (?| is used this is not
2067       necessarily the case because later capture groups may have  lower  num‐
2068       bers.
2069
2070       As  a  simple  example of the name/number table, consider the following
2071       pattern after compilation by the 8-bit library  (assume  PCRE2_EXTENDED
2072       is set, so white space - including newlines - is ignored):
2073
2074         (?<date> (?<year>(\d\d)?\d\d) -
2075         (?<month>\d\d) - (?<day>\d\d) )
2076
2077       There are four named capture groups, so the table has four entries, and
2078       each entry in the table is eight bytes long. The table is  as  follows,
2079       with non-printing bytes shown in hexadecimal, and undefined bytes shown
2080       as ??:
2081
2082         00 01 d  a  t  e  00 ??
2083         00 05 d  a  y  00 ?? ??
2084         00 04 m  o  n  t  h  00
2085         00 02 y  e  a  r  00 ??
2086
2087       When writing code to extract data from named capture groups  using  the
2088       name-to-number  map,  remember that the length of the entries is likely
2089       to be different for each compiled pattern.
2090
2091         PCRE2_INFO_NEWLINE
2092
2093       The output is one of the following uint32_t values:
2094
2095         PCRE2_NEWLINE_CR       Carriage return (CR)
2096         PCRE2_NEWLINE_LF       Linefeed (LF)
2097         PCRE2_NEWLINE_CRLF     Carriage return, linefeed (CRLF)
2098         PCRE2_NEWLINE_ANY      Any Unicode line ending
2099         PCRE2_NEWLINE_ANYCRLF  Any of CR, LF, or CRLF
2100         PCRE2_NEWLINE_NUL      The NUL character (binary zero)
2101
2102       This identifies the character sequence that will be recognized as mean‐
2103       ing "newline" while matching.
2104
2105         PCRE2_INFO_SIZE
2106
2107       Return  the  size  of  the  compiled  pattern  in  bytes (for all three
2108       libraries). The third argument should point to a size_t variable.  This
2109       value  includes  the  size  of the general data block that precedes the
2110       code units of the compiled pattern itself. The value that is used  when
2111       pcre2_compile()  is  getting memory in which to place the compiled pat‐
2112       tern may be slightly larger than the value  returned  by  this  option,
2113       because  there are cases where the code that calculates the size has to
2114       over-estimate. Processing a pattern with  the  JIT  compiler  does  not
2115       alter the value returned by this option.
2116

INFORMATION ABOUT A PATTERN'S CALLOUTS

2118
2119       int pcre2_callout_enumerate(const pcre2_code *code,
2120         int (*callback)(pcre2_callout_enumerate_block *, void *),
2121         void *user_data);
2122
2123       A script language that supports the use of string arguments in callouts
2124       might like to scan all the callouts in a  pattern  before  running  the
2125       match. This can be done by calling pcre2_callout_enumerate(). The first
2126       argument is a pointer to a compiled pattern, the  second  points  to  a
2127       callback  function,  and the third is arbitrary user data. The callback
2128       function is called for every callout in the pattern  in  the  order  in
2129       which they appear. Its first argument is a pointer to a callout enumer‐
2130       ation block, and its second argument is the user_data  value  that  was
2131       passed  to  pcre2_callout_enumerate(). The contents of the callout enu‐
2132       meration block are described in the pcre2callout  documentation,  which
2133       also gives further details about callouts.
2134

SERIALIZATION AND PRECOMPILING

2136
2137       It  is  possible  to  save  compiled patterns on disc or elsewhere, and
2138       reload them later, subject to a number of  restrictions.  The  host  on
2139       which  the  patterns  are  reloaded must be running the same version of
2140       PCRE2, with the same code unit width, and must also have the same endi‐
2141       anness,  pointer  width,  and PCRE2_SIZE type. Before compiled patterns
2142       can be saved, they must be converted to a "serialized" form,  which  in
2143       the  case of PCRE2 is really just a bytecode dump.  The functions whose
2144       names begin with pcre2_serialize_ are used for converting to  and  from
2145       the  serialized form. They are described in the pcre2serialize documen‐
2146       tation. Note that PCRE2 serialization does not  convert  compiled  pat‐
2147       terns to an abstract format like Java or .NET serialization.
2148

THE MATCH DATA BLOCK

2150
2151       pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
2152         pcre2_general_context *gcontext);
2153
2154       pcre2_match_data *pcre2_match_data_create_from_pattern(
2155         const pcre2_code *code, pcre2_general_context *gcontext);
2156
2157       void pcre2_match_data_free(pcre2_match_data *match_data);
2158
2159       Information  about  a  successful  or unsuccessful match is placed in a
2160       match data block, which is an opaque  structure  that  is  accessed  by
2161       function  calls.  In particular, the match data block contains a vector
2162       of offsets into the subject string that define the matched part of  the
2163       subject  and  any  substrings  that were captured. This is known as the
2164       ovector.
2165
2166       Before calling pcre2_match(), pcre2_dfa_match(),  or  pcre2_jit_match()
2167       you must create a match data block by calling one of the creation func‐
2168       tions above. For pcre2_match_data_create(), the first argument  is  the
2169       number  of  pairs  of  offsets  in  the ovector. One pair of offsets is
2170       required to identify the string that matched the whole pattern, with an
2171       additional  pair for each captured substring. For example, a value of 4
2172       creates enough space to record the matched portion of the subject  plus
2173       three  captured  substrings. A minimum of at least 1 pair is imposed by
2174       pcre2_match_data_create(), so it is always possible to return the over‐
2175       all matched string.
2176
2177       The second argument of pcre2_match_data_create() is a pointer to a gen‐
2178       eral context, which can specify custom memory management for  obtaining
2179       the memory for the match data block. If you are not using custom memory
2180       management, pass NULL, which causes malloc() to be used.
2181
2182       For pcre2_match_data_create_from_pattern(), the  first  argument  is  a
2183       pointer to a compiled pattern. The ovector is created to be exactly the
2184       right size to hold all the substrings a pattern might capture. The sec‐
2185       ond  argument is again a pointer to a general context, but in this case
2186       if NULL is passed, the memory is obtained using the same allocator that
2187       was used for the compiled pattern (custom or default).
2188
2189       A  match  data block can be used many times, with the same or different
2190       compiled patterns. You can extract information from a match data  block
2191       after  a  match  operation  has  finished,  using  functions  that  are
2192       described in the sections on  matched  strings  and  other  match  data
2193       below.
2194
2195       When  a  call  of  pcre2_match()  fails, valid data is available in the
2196       match   block   only   when   the   error    is    PCRE2_ERROR_NOMATCH,
2197       PCRE2_ERROR_PARTIAL,  or  one  of  the  error  codes for an invalid UTF
2198       string. Exactly what is available depends on the error, and is detailed
2199       below.
2200
2201       When  one of the matching functions is called, pointers to the compiled
2202       pattern and the subject string are set in the match data block so  that
2203       they  can  be referenced by the extraction functions after a successful
2204       match. After running a match, you must not free a compiled pattern or a
2205       subject  string until after all operations on the match data block (for
2206       that match) have taken place,  unless,  in  the  case  of  the  subject
2207       string,  you  have used the PCRE2_COPY_MATCHED_SUBJECT option, which is
2208       described in the  section  entitled  "Option  bits  for  pcre2_match()"
2209       below.
2210
2211       When  a match data block itself is no longer needed, it should be freed
2212       by calling pcre2_match_data_free(). If this function is called  with  a
2213       NULL argument, it returns immediately, without doing anything.
2214

MATCHING A PATTERN: THE TRADITIONAL FUNCTION

2216
2217       int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
2218         PCRE2_SIZE length, PCRE2_SIZE startoffset,
2219         uint32_t options, pcre2_match_data *match_data,
2220         pcre2_match_context *mcontext);
2221
2222       The  function pcre2_match() is called to match a subject string against
2223       a compiled pattern, which is passed in the code argument. You can  call
2224       pcre2_match() with the same code argument as many times as you like, in
2225       order to find multiple matches in the subject string or to  match  dif‐
2226       ferent subject strings with the same pattern.
2227
2228       This  function  is  the  main  matching facility of the library, and it
2229       operates in a Perl-like manner. For specialist use  there  is  also  an
2230       alternative  matching function, which is described below in the section
2231       about the pcre2_dfa_match() function.
2232
2233       Here is an example of a simple call to pcre2_match():
2234
2235         pcre2_match_data *md = pcre2_match_data_create(4, NULL);
2236         int rc = pcre2_match(
2237           re,             /* result of pcre2_compile() */
2238           "some string",  /* the subject string */
2239           11,             /* the length of the subject string */
2240           0,              /* start at offset 0 in the subject */
2241           0,              /* default options */
2242           md,             /* the match data block */
2243           NULL);          /* a match context; NULL means use defaults */
2244
2245       If the subject string is zero-terminated, the length can  be  given  as
2246       PCRE2_ZERO_TERMINATED. A match context must be provided if certain less
2247       common matching parameters are to be changed. For details, see the sec‐
2248       tion on the match context above.
2249
2250   The string to be matched by pcre2_match()
2251
2252       The  subject string is passed to pcre2_match() as a pointer in subject,
2253       a length in length, and a starting offset in  startoffset.  The  length
2254       and  offset  are  in  code units, not characters.  That is, they are in
2255       bytes for the 8-bit library, 16-bit code units for the 16-bit  library,
2256       and  32-bit  code units for the 32-bit library, whether or not UTF pro‐
2257       cessing is enabled.
2258
2259       If startoffset is greater than the length of the subject, pcre2_match()
2260       returns  PCRE2_ERROR_BADOFFSET.  When  the starting offset is zero, the
2261       search for a match starts at the beginning of the subject, and this  is
2262       by far the most common case. In UTF-8 or UTF-16 mode, the starting off‐
2263       set must point to the start of a character, or to the end of  the  sub‐
2264       ject  (in  UTF-32 mode, one code unit equals one character, so all off‐
2265       sets are valid). Like the  pattern  string,  the  subject  may  contain
2266       binary zeros.
2267
2268       A  non-zero  starting offset is useful when searching for another match
2269       in the same subject by calling pcre2_match()  again  after  a  previous
2270       success.   Setting  startoffset  differs  from passing over a shortened
2271       string and setting PCRE2_NOTBOL in the case of a  pattern  that  begins
2272       with any kind of lookbehind. For example, consider the pattern
2273
2274         \Biss\B
2275
2276       which  finds  occurrences  of "iss" in the middle of words. (\B matches
2277       only if the current position in the subject is not  a  word  boundary.)
2278       When applied to the string "Mississippi" the first call to pcre2_match()
2279       finds the first occurrence. If pcre2_match() is called again with  just
2280       the  remainder  of  the  subject,  namely "issippi", it does not match,
2281       because \B is always false at the start of the subject, which is deemed
2282       to  be  a word boundary. However, if pcre2_match() is passed the entire
2283       string again, but with startoffset set to 4, it finds the second occur‐
2284       rence  of "iss" because it is able to look behind the starting point to
2285       discover that it is preceded by a letter.
2286
2287       Finding all the matches in a subject is tricky  when  the  pattern  can
2288       match an empty string. It is possible to emulate Perl's /g behaviour by
2289       first  trying  the  match  again  at  the   same   offset,   with   the
2290       PCRE2_NOTEMPTY_ATSTART  and  PCRE2_ANCHORED  options,  and then if that
2291       fails, advancing the starting  offset  and  trying  an  ordinary  match
2292       again.  There  is  some  code  that  demonstrates how to do this in the
2293       pcre2demo sample program. In the most general case, you have  to  check
2294       to  see  if the newline convention recognizes CRLF as a newline, and if
2295       so, and the current character is CR followed by LF, advance the  start‐
2296       ing offset by two characters instead of one.
2297
2298       If a non-zero starting offset is passed when the pattern is anchored, a
2299       single attempt to match at the given offset is made. This can only suc‐
2300       ceed  if  the  pattern does not require the match to be at the start of
2301       the subject. In other words, the anchoring must be the result  of  set‐
2302       ting  the PCRE2_ANCHORED option or the use of .* with PCRE2_DOTALL, not
2303       by starting the pattern with ^ or \A.
2304
2305   Option bits for pcre2_match()
2306
2307       The unused bits of the options argument for pcre2_match() must be zero.
2308       The    only    bits    that    may    be    set   are   PCRE2_ANCHORED,
2309       PCRE2_COPY_MATCHED_SUBJECT,      PCRE2_ENDANCHORED,       PCRE2_NOTBOL,
2310       PCRE2_NOTEOL,   PCRE2_NOTEMPTY,  PCRE2_NOTEMPTY_ATSTART,  PCRE2_NO_JIT,
2311       PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and  PCRE2_PARTIAL_SOFT.  Their
2312       action is described below.
2313
2314       Setting  PCRE2_ANCHORED  or PCRE2_ENDANCHORED at match time is not sup‐
2315       ported by the just-in-time (JIT) compiler. If it is set,  JIT  matching
2316       is  disabled  and  the interpretive code in pcre2_match() is run. Apart
2317       from PCRE2_NO_JIT (obviously), the remaining options are supported  for
2318       JIT matching.
2319
2320         PCRE2_ANCHORED
2321
2322       The PCRE2_ANCHORED option limits pcre2_match() to matching at the first
2323       matching position. If a pattern was compiled  with  PCRE2_ANCHORED,  or
2324       turned  out to be anchored by virtue of its contents, it cannot be made
2325       unanchored at matching time. Note that setting the option at match time
2326       disables JIT matching.
2327
2328         PCRE2_COPY_MATCHED_SUBJECT
2329
2330       By  default,  a  pointer to the subject is remembered in the match data
2331       block so that, after a successful match, it can be  referenced  by  the
2332       substring  extraction  functions.  This means that the subject's memory
2333       must not be freed until all such  operations  are  complete.  For  some
2334       applications  where  the  lifetime of the subject string is not guaran‐
2335       teed, it may be necessary to make a copy of the subject string, but  it
2336       is wasteful to do this unless the match is successful. After a success‐
2337       ful match, if PCRE2_COPY_MATCHED_SUBJECT is set, the subject is  copied
2338       and  the  new  pointer is remembered in the match data block instead of
2339       the original subject pointer. The memory allocator that  was  used  for
2340       the  match  block  itself is used. The copy is automatically freed when
2341       pcre2_match_data_free() is called to free the match data block.  It  is
2342       also automatically freed if the match data block is re-used for another
2343       match operation.
2344
2345         PCRE2_ENDANCHORED
2346
2347       If the PCRE2_ENDANCHORED option is set, any string  that  pcre2_match()
2348       matches  must be right at the end of the subject string. Note that set‐
2349       ting the option at match time disables JIT matching.
2350
2351         PCRE2_NOTBOL
2352
2353       This option specifies that the first character of the subject string is not
2354       the  beginning  of  a  line, so the circumflex metacharacter should not
2355       match before it. Setting this without  having  set  PCRE2_MULTILINE  at
2356       compile time causes circumflex never to match. This option affects only
2357       the behaviour of the circumflex metacharacter. It does not affect \A.
2358
2359         PCRE2_NOTEOL
2360
2361       This option specifies that the end of the subject string is not the end
2362       of  a line, so the dollar metacharacter should not match it nor (except
2363       in multiline mode) a newline immediately before it. Setting this  with‐
2364       out  having  set PCRE2_MULTILINE at compile time causes dollar never to
2365       match. This option affects only the behaviour of the dollar metacharac‐
2366       ter. It does not affect \Z or \z.
2367
2368         PCRE2_NOTEMPTY
2369
2370       An empty string is not considered to be a valid match if this option is
2371       set. If there are alternatives in the pattern, they are tried.  If  all
2372       the  alternatives  match  the empty string, the entire match fails. For
2373       example, if the pattern
2374
2375         a?b?
2376
2377       is applied to a string not beginning with "a" or  "b",  it  matches  an
2378       empty string at the start of the subject. With PCRE2_NOTEMPTY set, this
2379       match is not valid, so pcre2_match() searches further into  the  string
2380       for occurrences of "a" or "b".
2381
2382         PCRE2_NOTEMPTY_ATSTART
2383
2384       This  is  like PCRE2_NOTEMPTY, except that it locks out an empty string
2385       match only at the first matching position, that is, at the start of the
2386       subject  plus  the  starting offset. An empty string match later in the
2387       subject is permitted.  If the pattern is anchored,  such  a  match  can
2388       occur only if the pattern contains \K.
2389
2390         PCRE2_NO_JIT
2391
2392       By   default,   if   a  pattern  has  been  successfully  processed  by
2393       pcre2_jit_compile(), JIT is automatically used  when  pcre2_match()  is
2394       called  with  options  that JIT supports. Setting PCRE2_NO_JIT disables
2395       the use of JIT; it forces matching to be done by the interpreter.
2396
2397         PCRE2_NO_UTF_CHECK
2398
2399       When PCRE2_UTF is set at compile time, the validity of the subject as a
2400       UTF  string  is  checked  by default when pcre2_match() is subsequently
2401       called.  If a non-zero starting offset is given, the check  is  applied
2402       only  to that part of the subject that could be inspected during match‐
2403       ing, and there is a check that the starting offset points to the  first
2404       code  unit of a character or to the end of the subject. If there are no
2405       lookbehind assertions in the pattern, the check starts at the  starting
2406       offset.  Otherwise,  it  starts at the length of the longest lookbehind
2407       before the starting offset, or at the start of the subject if there are
2408       not  that  many  characters  before  the starting offset. Note that the
2409       sequences \b and \B are one-character lookbehinds.
2410
2411       The check is carried out before any other processing takes place, and a
2412       negative  error  code is returned if the check fails. There are several
2413       UTF error codes for each code unit width,  corresponding  to  different
2414       problems  with  the code unit sequence. There are discussions about the
2415       validity of UTF-8 strings, UTF-16 strings, and UTF-32  strings  in  the
2416       pcre2unicode page.
2417
2418       If  you  know  that  your  subject is valid, and you want to skip these
2419       checks for performance reasons,  you  can  set  the  PCRE2_NO_UTF_CHECK
2420       option  when  calling  pcre2_match(). You might want to do this for the
2421       second and subsequent calls to pcre2_match() if you are making repeated
2422       calls to find other matches in the same subject string.
2423
2424       Warning:  When  PCRE2_NO_UTF_CHECK  is  set,  the  effect of passing an
2425       invalid string as a subject, or an invalid  value  of  startoffset,  is
2426       undefined.  Your program may crash or loop indefinitely.
2427
2428         PCRE2_PARTIAL_HARD
2429         PCRE2_PARTIAL_SOFT
2430
2431       These  options  turn  on  the partial matching feature. A partial match
2432       occurs if the end of the subject string is  reached  successfully,  but
2433       there  are not enough subject characters to complete the match. If this
2434       happens when PCRE2_PARTIAL_SOFT (but not  PCRE2_PARTIAL_HARD)  is  set,
2435       matching  continues  by  testing any remaining alternatives. Only if no
2436       complete match can be found is PCRE2_ERROR_PARTIAL returned instead  of
2437       PCRE2_ERROR_NOMATCH.  In other words, PCRE2_PARTIAL_SOFT specifies that
2438       the caller is prepared to handle a partial match, but only if  no  com‐
2439       plete match can be found.
2440
2441       If  PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this
2442       case, if a partial match is found,  pcre2_match()  immediately  returns
2443       PCRE2_ERROR_PARTIAL,  without  considering  any  other alternatives. In
2444       other words, when PCRE2_PARTIAL_HARD is set, a partial match is consid‐
2445       ered to be more important than an alternative complete match.
2446
2447       There is a more detailed discussion of partial and multi-segment match‐
2448       ing, with examples, in the pcre2partial documentation.
2449

NEWLINE HANDLING WHEN MATCHING

2451
2452       When PCRE2 is built, a default newline convention is set; this is  usu‐
2453       ally  the standard convention for the operating system. The default can
2454       be overridden in a compile context by calling  pcre2_set_newline().  It
2455       can  also be overridden by starting a pattern string with, for example,
2456       (*CRLF), as described in the section  on  newline  conventions  in  the
2457       pcre2pattern  page. During matching, the newline choice affects the be‐
2458       haviour of the dot, circumflex, and dollar metacharacters. It may  also
2459       alter  the  way  the  match starting position is advanced after a match
2460       failure for an unanchored pattern.
2461
2462       When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is
2463       set  as  the  newline convention, and a match attempt for an unanchored
2464       pattern fails when the current starting position is at a CRLF sequence,
2465       and  the  pattern contains no explicit matches for CR or LF characters,
2466       the match position is advanced by two characters  instead  of  one,  in
2467       other words, to after the CRLF.
2468
2469       The above rule is a compromise that makes the most common cases work as
2470       expected. For example, if the pattern  is  .+A  (and  the  PCRE2_DOTALL
2471       option is not set), it does not match the string "\r\nA" because, after
2472       failing at the start, it skips both the CR and the LF before  retrying.
2473       However,  the  pattern  [\r\n]A does match that string, because it con‐
2474       tains an explicit CR or LF reference, and so advances only by one char‐
2475       acter after the first failure.
2476
2477       An explicit match for CR or LF is either a literal appearance of one of
2478       those characters in the pattern, or one of the \r or \n  or  equivalent
2479       octal or hexadecimal escape sequences. Implicit matches such as [^X] do
2480       not count, nor does \s, even though it includes CR and LF in the  char‐
2481       acters that it matches.
2482
2483       Notwithstanding  the above, anomalous effects may still occur when CRLF
2484       is a valid newline sequence and explicit \r or \n escapes appear in the
2485       pattern.
2486

HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS

2488
2489       uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
2490
2491       PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
2492
2493       In  general, a pattern matches a certain portion of the subject, and in
2494       addition, further substrings from the subject  may  be  picked  out  by
2495       parenthesized  parts  of  the  pattern.  Following the usage in Jeffrey
2496       Friedl's book, this is called "capturing"  in  what  follows,  and  the
2497       phrase  "capture  group" (Perl terminology) is used for a fragment of a
2498       pattern that picks out a substring. PCRE2 supports several other  kinds
2499       of parenthesized group that do not cause substrings to be captured. The
2500       pcre2_pattern_info() function can be used to find out how many  capture
2501       groups there are in a compiled pattern.
2502
2503       You  can  use  auxiliary functions for accessing captured substrings by
2504       number or by name, as described in sections below.
2505
2506       Alternatively, you can make direct use of the vector of PCRE2_SIZE val‐
2507       ues,  called  the  ovector,  which  contains  the  offsets  of captured
2508       strings.  It  is  part  of  the  match  data   block.    The   function
2509       pcre2_get_ovector_pointer()  returns  the  address  of the ovector, and
2510       pcre2_get_ovector_count() returns the number of pairs of values it con‐
2511       tains.
2512
2513       Within the ovector, the first in each pair of values is set to the off‐
2514       set of the first code unit of a substring, and the second is set to the
2515       offset  of the first code unit after the end of a substring. These val‐
2516       ues are always code unit offsets, not character offsets. That is,  they
2517       are  byte  offsets  in  the 8-bit library, 16-bit offsets in the 16-bit
2518       library, and 32-bit offsets in the 32-bit library.
2519
2520       After a partial match  (error  return  PCRE2_ERROR_PARTIAL),  only  the
2521       first  pair  of  offsets  (that is, ovector[0] and ovector[1]) are set.
2522       They identify the part of the subject that was partially  matched.  See
2523       the pcre2partial documentation for details of partial matching.
2524
2525       After  a  fully  successful match, the first pair of offsets identifies
2526       the portion of the subject string that was matched by the  entire  pat‐
2527       tern.  The  next  pair is used for the first captured substring, and so
2528       on. The value returned by pcre2_match() is one more  than  the  highest
2529       numbered  pair  that  has been set. For example, if two substrings have
2530       been captured, the returned value is 3. If there are no  captured  sub‐
2531       strings, the return value from a successful match is 1, indicating that
2532       just the first pair of offsets has been set.
2533
2534       If a pattern uses the \K escape sequence within a  positive  assertion,
2535       the reported start of a successful match can be greater than the end of
2536       the match.  For example, if the pattern  (?=ab\K)  is  matched  against
2537       "ab", the start and end offset values for the match are 2 and 0.
2538
2539       If  a  capture group is matched repeatedly within a single match opera‐
2540       tion, it is the last portion of the subject that  it  matched  that  is
2541       returned.
2542
2543       If the ovector is too small to hold all the captured substring offsets,
2544       as much as possible is filled in, and the function returns a  value  of
2545       zero.  If captured substrings are not of interest, pcre2_match() may be
2546       called with a match data block whose ovector is of minimum length (that
2547       is, one pair).
2548
2549       It  is  possible for capture group number n+1 to match some part of the
2550       subject when group n has not been used at  all.  For  example,  if  the
2551       string "abc" is matched against the pattern (a|(z))(bc) the return from
2552       the function is 4, and groups 1 and 3 are matched, but 2 is  not.  When
2553       this  happens,  both values in the offset pairs corresponding to unused
2554       groups are set to PCRE2_UNSET.
2555
2556       Offset values that correspond to  unused  groups  at  the  end  of  the
2557       expression  are  also  set  to  PCRE2_UNSET. For example, if the string
2558       "abc" is matched against the pattern (abc)(x(yz)?)? groups 2 and 3  are
2559       not  matched.  The  return  from the function is 2, because the highest
2560       used capture group number is 1. The  offsets  for  the  second  and
2561       third  capture  groups (assuming the vector is large enough, of course)
2562       are set to PCRE2_UNSET.
2563
2564       Elements in the ovector that do not correspond to capturing parentheses
2565       in the pattern are never changed. That is, if a pattern contains n cap‐
2566       turing parentheses, no more than ovector[0] to ovector[2n+1] are set by
2567       pcre2_match().  The  other  elements retain whatever values they previ‐
2568       ously had. After a failed match attempt, the contents  of  the  ovector
2569       are unchanged.
2570

OTHER INFORMATION ABOUT A MATCH

2572
2573       PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
2574
2575       PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
2576
2577       As  well as the offsets in the ovector, other information about a match
2578       is retained in the match data block and can be retrieved by  the  above
2579       functions  in  appropriate  circumstances.  If they are called at other
2580       times, the result is undefined.
2581
2582       After a successful match, a partial match (PCRE2_ERROR_PARTIAL),  or  a
2583       failure  to  match (PCRE2_ERROR_NOMATCH), a mark name may be available.
2584       The function pcre2_get_mark() can be called to access this name,  which
2585       can  be  specified  in  the  pattern by any of the backtracking control
2586       verbs, not just (*MARK). The same function applies to all the verbs. It
2587       returns a pointer to the zero-terminated name, which is within the com‐
2588       piled pattern. If no name is available, NULL is returned. The length of
2589       the  name  (excluding  the terminating zero) is stored in the code unit
2590       that precedes the name. You should use this length instead  of  relying
2591       on the terminating zero if the name might contain a binary zero.
2592
2593       After  a  successful  match, the name that is returned is the last mark
2594       name encountered on the matching path through the pattern. Instances of
2595       backtracking  verbs  without  names do not count. Thus, for example, if
2596       the matching path contains (*MARK:A)(*PRUNE), the name "A" is returned.
2597       After  a  "no  match"  or a partial match, the last encountered name is
2598       returned. For example, consider this pattern:
2599
2600         ^(*MARK:A)((*MARK:B)a|b)c
2601
2602       When it matches "bc", the returned name is A. The B mark is  "seen"  in
2603       the  first  branch of the group, but it is not on the matching path. On
2604       the other hand, when this pattern fails to  match  "bx",  the  returned
2605       name is B.
2606
2607       Warning:  By  default, certain start-of-match optimizations are used to
2608       give a fast "no match" result in some situations. For example,  if  the
2609       anchoring  is removed from the pattern above, there is an initial check
2610       for the presence of "c" in the  subject  before  running  the  matching
2611       engine. This check fails for "bx", causing a match failure without see‐
2612       ing any marks. You can disable the start-of-match optimizations by set‐
2613       ting  the  PCRE2_NO_START_OPTIMIZE  option  for  pcre2_compile()  or by
2614       starting the pattern with (*NO_START_OPT).
2615
2616       After a successful match, a partial match, or one of  the  invalid  UTF
2617       errors  (for example, PCRE2_ERROR_UTF8_ERR5), pcre2_get_startchar() can
2618       be called. After a successful or partial match it returns the code unit
2619       offset  of  the character at which the match started. For a non-partial
2620       match, this can be different to the value of ovector[0] if the  pattern
2621       contains  the  \K escape sequence. After a partial match, however, this
2622       value is always the same as ovector[0] because \K does not  affect  the
2623       result of a partial match.
2624
2625       After  a UTF check failure, pcre2_get_startchar() can be used to obtain
2626       the code unit offset of the invalid UTF character. Details are given in
2627       the pcre2unicode page.
2628

ERROR RETURNS FROM pcre2_match()

2630
2631       If  pcre2_match() fails, it returns a negative number. This can be con‐
2632       verted to a text string by calling the pcre2_get_error_message()  func‐
2633       tion  (see  "Obtaining a textual error message" below).  Negative error
2634       codes are also returned by other functions,  and  are  documented  with
2635       them.  The codes are given names in the header file. If UTF checking is
2636       in force and an invalid UTF subject string is detected, one of a number
2637       of  UTF-specific negative error codes is returned. Details are given in
2638       the pcre2unicode page. The following are the other errors that  may  be
2639       returned by pcre2_match():
2640
2641         PCRE2_ERROR_NOMATCH
2642
2643       The subject string did not match the pattern.
2644
2645         PCRE2_ERROR_PARTIAL
2646
2647       The  subject  string did not match, but it did match partially. See the
2648       pcre2partial documentation for details of partial matching.
2649
2650         PCRE2_ERROR_BADMAGIC
2651
2652       PCRE2 stores a 4-byte "magic number" at the start of the compiled code,
2653       to  catch  the case when it is passed a junk pointer. This is the error
2654       that is returned when the magic number is not present.
2655
2656         PCRE2_ERROR_BADMODE
2657
2658       This error is given when a compiled pattern is passed to a function  in
2659       a  library  of a different code unit width, for example, a pattern com‐
2660       piled by the 8-bit library is passed to  a  16-bit  or  32-bit  library
2661       function.
2662
2663         PCRE2_ERROR_BADOFFSET
2664
2665       The value of startoffset was greater than the length of the subject.
2666
2667         PCRE2_ERROR_BADOPTION
2668
2669       An unrecognized bit was set in the options argument.
2670
2671         PCRE2_ERROR_BADUTFOFFSET
2672
2673       The UTF code unit sequence that was passed as a subject was checked and
2674       found to be valid (the PCRE2_NO_UTF_CHECK option was not set), but  the
2675       value  of startoffset did not point to the beginning of a UTF character
2676       or the end of the subject.
2677
2678         PCRE2_ERROR_CALLOUT
2679
2680       This error is never generated by pcre2_match() itself. It  is  provided
2681       for  use  by  callout  functions  that  want  to cause pcre2_match() or
2682       pcre2_callout_enumerate() to return a distinctive error code.  See  the
2683       pcre2callout documentation for details.
2684
2685         PCRE2_ERROR_DEPTHLIMIT
2686
2687       The nested backtracking depth limit was reached.
2688
2689         PCRE2_ERROR_HEAPLIMIT
2690
2691       The heap limit was reached.
2692
2693         PCRE2_ERROR_INTERNAL
2694
2695       An  unexpected  internal error has occurred. This error could be caused
2696       by a bug in PCRE2 or by overwriting of the compiled pattern.
2697
2698         PCRE2_ERROR_JIT_STACKLIMIT
2699
2700       This error is returned when a pattern  that  was  successfully  studied
2701       using  JIT  is being matched, but the memory available for the just-in-
2702       time processing stack is not large enough. See the pcre2jit  documenta‐
2703       tion for more details.
2704
2705         PCRE2_ERROR_MATCHLIMIT
2706
2707       The backtracking match limit was reached.
2708
2709         PCRE2_ERROR_NOMEMORY
2710
2711       If  a  pattern contains many nested backtracking points, heap memory is
2712       used to remember them. This error is given when the  memory  allocation
2713       function  (default  or  custom)  fails.  Note  that  a different error,
2714       PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed  exceeds
2715       the    heap   limit.   PCRE2_ERROR_NOMEMORY   is   also   returned   if
2716       PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
2717
2718         PCRE2_ERROR_NULL
2719
2720       Either the code, subject, or match_data argument was passed as NULL.
2721
2722         PCRE2_ERROR_RECURSELOOP
2723
2724       This error is returned when  pcre2_match()  detects  a  recursion  loop
2725       within  the  pattern. Specifically, it means that either the whole pat‐
2726       tern or a capture group has been called recursively for the second time
2727       at  the  same position in the subject string. Some simple patterns that
2728       might do this are detected and faulted at compile time, but  more  com‐
2729       plicated  cases,  in particular mutual recursions between two different
2730       groups, cannot be detected until matching is attempted.
2731

OBTAINING A TEXTUAL ERROR MESSAGE

2733
2734       int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
2735         PCRE2_SIZE bufflen);
2736
2737       A text message for an error code  from  any  PCRE2  function  (compile,
2738       match,  or  auxiliary)  can be obtained by calling pcre2_get_error_mes‐
2739       sage(). The code is passed as the first argument,  with  the  remaining
2740       two  arguments  specifying  a  code  unit buffer and its length in code
2741       units, into which the text message is placed. The message  is  returned
2742       in  code  units  of the appropriate width for the library that is being
2743       used.
2744
2745       The returned message is terminated with a trailing zero, and the  func‐
2746       tion  returns  the  number  of  code units used, excluding the trailing
2747       zero.  If  the  error  number  is  unknown,  the  negative  error  code
2748       PCRE2_ERROR_BADDATA  is  returned. If the buffer is too small, the mes‐
2749       sage is truncated (but still with a trailing zero),  and  the  negative
2750       error  code PCRE2_ERROR_NOMEMORY is returned.  None of the messages are
2751       very long; a buffer size of 120 code units is ample.
2752

EXTRACTING CAPTURED SUBSTRINGS BY NUMBER

2754
2755       int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
2756         uint32_t number, PCRE2_SIZE *length);
2757
2758       int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
2759         uint32_t number, PCRE2_UCHAR *buffer,
2760         PCRE2_SIZE *bufflen);
2761
2762       int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
2763         uint32_t number, PCRE2_UCHAR **bufferptr,
2764         PCRE2_SIZE *bufflen);
2765
2766       void pcre2_substring_free(PCRE2_UCHAR *buffer);
2767
2768       Captured substrings can be accessed directly by using  the  ovector  as
2769       described above.  For convenience, auxiliary functions are provided for
2770       extracting  captured  substrings  as  new,  separate,   zero-terminated
2771       strings. A substring that contains a binary zero is correctly extracted
2772       and has a further zero added on the end, but  the  result  is  not,  of
2773       course, a C string.
2774
2775       The functions in this section identify substrings by number. The number
2776       zero refers to the entire matched substring, with higher numbers refer‐
2777       ring  to  substrings  captured by parenthesized groups. After a partial
2778       match, only substring zero is available.  An  attempt  to  extract  any
2779       other  substring  gives the error PCRE2_ERROR_PARTIAL. The next section
2780       describes similar functions for extracting captured substrings by name.
2781
2782       If a pattern uses the \K escape sequence within a  positive  assertion,
2783       the reported start of a successful match can be greater than the end of
2784       the match.  For example, if the pattern  (?=ab\K)  is  matched  against
2785       "ab",  the  start  and  end offset values for the match are 2 and 0. In
2786       this situation, calling these functions with a  zero  substring  number
2787       extracts a zero-length empty string.
2788
2789       You  can  find the length in code units of a captured substring without
2790       extracting it by calling pcre2_substring_length_bynumber().  The  first
2791       argument  is a pointer to the match data block, the second is the group
2792       number, and the third is a pointer to a variable into which the  length
2793       is  placed.  If  you just want to know whether or not the substring has
2794       been captured, you can pass the third argument as NULL.
2795
2796       The pcre2_substring_copy_bynumber() function  copies  a  captured  sub‐
2797       string  into  a supplied buffer, whereas pcre2_substring_get_bynumber()
2798       copies it into new memory, obtained using the  same  memory  allocation
2799       function  that  was  used for the match data block. The first two argu‐
2800       ments of these functions are a pointer to the match data  block  and  a
2801       capture group number.
2802
2803       The final arguments of pcre2_substring_copy_bynumber() are a pointer to
2804       the buffer and a pointer to a variable that contains its length in code
2805       units.  This is updated to contain the actual number of code units used
2806       for the extracted substring, excluding the terminating zero.
2807
2808       For pcre2_substring_get_bynumber() the third and fourth arguments point
2809       to  variables that are updated with a pointer to the new memory and the
2810       number of code units that comprise the substring, again  excluding  the
2811       terminating  zero.  When  the substring is no longer needed, the memory
2812       should be freed by calling pcre2_substring_free().
2813
2814       The return value from all these functions is zero  for  success,  or  a
2815       negative  error  code.  If  the pattern match failed, the match failure
2816       code is returned.  If a substring number  greater  than  zero  is  used
2817       after  a partial match, PCRE2_ERROR_PARTIAL is returned. Other possible
2818       error codes are:
2819
2820         PCRE2_ERROR_NOMEMORY
2821
2822       The buffer was too small for  pcre2_substring_copy_bynumber(),  or  the
2823       attempt to get memory failed for pcre2_substring_get_bynumber().
2824
2825         PCRE2_ERROR_NOSUBSTRING
2826
2827       There  is  no  substring  with that number in the pattern, that is, the
2828       number is greater than the number of capturing parentheses.
2829
2830         PCRE2_ERROR_UNAVAILABLE
2831
2832       The substring number, though not greater than the number of captures in
2833       the pattern, is greater than the number of slots in the ovector, so the
2834       substring could not be captured.
2835
2836         PCRE2_ERROR_UNSET
2837
2838       The substring did not participate in the match.  For  example,  if  the
2839       pattern  is  (abc)|(def) and the subject is "def", and the ovector con‐
2840       tains at least two capturing slots, substring number 1 is unset.
2841

EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS

2843
2844       int pcre2_substring_list_get(pcre2_match_data *match_data,
2845         PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
2846
2847       void pcre2_substring_list_free(PCRE2_SPTR *list);
2848
2849       The pcre2_substring_list_get() function  extracts  all  available  sub‐
2850       strings  and  builds  a  list of pointers to them. It also (optionally)
2851       builds a second list that  contains  their  lengths  (in  code  units),
2852       excluding a terminating zero that is added to each of them. All this is
2853       done in a single block of memory that is obtained using the same memory
2854       allocation function that was used to get the match data block.
2855
2856       This  function  must be called only after a successful match. If called
2857       after a partial match, the error code PCRE2_ERROR_PARTIAL is returned.
2858
2859       The address of the memory block is returned via listptr, which is  also
2860       the start of the list of string pointers. The end of the list is marked
2861       by a NULL pointer. The address of the list of lengths is  returned  via
2862       lengthsptr.  If your strings do not contain binary zeros and you do not
2863       therefore need the lengths, you may supply NULL as the lengthsptr argu‐
2864       ment  to  disable  the  creation of a list of lengths. The yield of the
2865       function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the  mem‐
2866       ory  block could not be obtained. When the list is no longer needed, it
2867       should be freed by calling pcre2_substring_list_free().
2868
2869       If this function encounters a substring that is unset, which can happen
2870       when  capture  group  number  n+1 matches some part of the subject, but
2871       group n has not been used at all, it returns an empty string. This  can
2872       be distinguished from a genuine zero-length substring by inspecting the
2873       appropriate offset in the ovector, which contains PCRE2_UNSET for unset
2874       substrings, or by calling pcre2_substring_length_bynumber().
2875

EXTRACTING CAPTURED SUBSTRINGS BY NAME

2877
2878       int pcre2_substring_number_from_name(const pcre2_code *code,
2879         PCRE2_SPTR name);
2880
2881       int pcre2_substring_length_byname(pcre2_match_data *match_data,
2882         PCRE2_SPTR name, PCRE2_SIZE *length);
2883
2884       int pcre2_substring_copy_byname(pcre2_match_data *match_data,
2885         PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
2886
2887       int pcre2_substring_get_byname(pcre2_match_data *match_data,
2888         PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
2889
2890       void pcre2_substring_free(PCRE2_UCHAR *buffer);
2891
2892       To extract a substring by name, you first have to find the associated
2893       number. For example, for this pattern:
2894
2895         (a+)b(?<xxx>\d+)...
2896
2897       the number of the capture group called "xxx" is 2. If the name is known
2898       to be unique (PCRE2_DUPNAMES was not set), you can find the number from
2899       the name by calling pcre2_substring_number_from_name(). The first argu‐
2900       ment  is the compiled pattern, and the second is the name. The yield of
2901       the function is the group number, PCRE2_ERROR_NOSUBSTRING if  there  is
2902       no  group  with that name, or PCRE2_ERROR_NOUNIQUESUBSTRING if there is
2903       more than one group with that name.  Given the number, you can  extract
2904       the  substring  directly from the ovector, or use one of the "bynumber"
2905       functions described above.
2906
2907       For convenience, there are also "byname" functions that  correspond  to
2908       the  "bynumber"  functions,  the  only difference being that the second
2909       argument is a name instead of a number. If PCRE2_DUPNAMES  is  set  and
2910       there are duplicate names, these functions scan all the groups with the
2911       given name, and return the captured  substring  from  the  first  named
2912       group that is set.
2913
2914       If  there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is
2915       returned. If all groups with the name have  numbers  that  are  greater
2916       than  the  number  of  slots in the ovector, PCRE2_ERROR_UNAVAILABLE is
2917       returned. If there is at least one group with a slot  in  the  ovector,
2918       but no group is found to be set, PCRE2_ERROR_UNSET is returned.
2919
2920       Warning: If the pattern uses the (?| feature to set up multiple capture
2921       groups with the same number, as described in the section  on  duplicate
2922       group numbers in the pcre2pattern page, you cannot use names to distin‐
2923       guish the different capture groups, because names are not  included  in
2924       the  compiled  code.  The  matching process uses only numbers. For this
2925       reason, the use of different names for  groups  with  the  same  number
2926       causes an error at compile time.
2927

CREATING A NEW STRING WITH SUBSTITUTIONS

2929
2930       int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
2931         PCRE2_SIZE length, PCRE2_SIZE startoffset,
2932         uint32_t options, pcre2_match_data *match_data,
2933         pcre2_match_context *mcontext, PCRE2_SPTR replacement,
2934         PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
2935         PCRE2_SIZE *outlengthptr);
2936
2937       This  function calls pcre2_match() and then makes a copy of the subject
2938       string in outputbuffer, replacing one or more parts that  were  matched
2939       with the replacement string, whose length is supplied in rlength.  This
2940       can be given as PCRE2_ZERO_TERMINATED  for  a  zero-terminated  string.
2941       The  default is to perform just one replacement, but there is an option
2942       that requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL  below
2943       for details).
2944
2945       Matches  in  which  a  \K item in a lookahead in the pattern causes the
2946       match to end before it starts are not supported, and give  rise  to  an
2947       error return. For global replacements, matches in which \K in a lookbe‐
2948       hind causes the match to start earlier than the point that was  reached
2949       in the previous iteration are also not supported.
2950
2951       The  first  seven  arguments  of pcre2_substitute() are the same as for
2952       pcre2_match(), except that the partial matching options are not permit‐
2953       ted,  and  match_data may be passed as NULL, in which case a match data
2954       block is obtained and freed within this function, using memory  manage‐
2955       ment  functions from the match context, if provided, or else those that
2956       were used to allocate memory for the compiled code.
2957
2958       If an external match_data block is provided,  its  contents  afterwards
2959       are  those  set by the final call to pcre2_match(). For global changes,
2960       this will have ended in a matching error. The contents of  the  ovector
2961       within the match data block may or may not have been changed.
2962
2963       The  outlengthptr  argument  must point to a variable that contains the
2964       length, in code units, of the output buffer. If the  function  is  suc‐
2965       cessful,  the value is updated to contain the length of the new string,
2966       excluding the trailing zero that is automatically added.
2967
2968       If the function is not  successful,  the  value  set  via  outlengthptr
2969       depends  on  the  type  of  error. For syntax errors in the replacement
2970       string, the value is the offset in the  replacement  string  where  the
2971       error  was  detected.  For  other  errors,  the value is PCRE2_UNSET by
2972       default. This includes the case of the output buffer being  too  small,
2973       unless  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  is  set (see below), in which
2974       case the value is the minimum length needed, including  space  for  the
2975       trailing  zero.  Note  that  in  order  to compute the required length,
2976       pcre2_substitute() has  to  simulate  all  the  matching  and  copying,
2977       instead of giving an error return as soon as the buffer overflows. Note
2978       also that the length is in code units, not bytes.
2979
2980       In the replacement string, which is interpreted as a UTF string in  UTF
2981       mode,  and  is  checked  for UTF validity unless the PCRE2_NO_UTF_CHECK
2982       option is set, a dollar character is an escape character that can spec‐
2983       ify  the  insertion  of  characters  from  capture groups or names from
2984       (*MARK) or other control verbs in the pattern. The following forms  are
2985       always recognized:
2986
2987         $$                  insert a dollar character
2988         $<n> or ${<n>}      insert the contents of group <n>
2989         $*MARK or ${*MARK}  insert a control verb name
2990
2991       Either  a  group  number  or  a  group name can be given for <n>. Curly
2992       brackets are required only if the following character would  be  inter‐
2993       preted as part of the number or name. The number may be zero to include
2994       the entire matched string.   For  example,  if  the  pattern  a(b)c  is
2995       matched  with "=abc=" and the replacement string "+$1$0$1+", the result
2996       is "=+babcb+=".
2997
2998       $*MARK inserts the name from the last encountered backtracking  control
2999       verb  on the matching path that has a name. (*MARK) must always include
3000       a name, but the other verbs need not.  For  example,  in  the  case  of
3001       (*MARK:A)(*PRUNE) the name inserted is "A", but for (*MARK:A)(*PRUNE:B)
3002       the relevant name is "B". This facility can be used to  perform  simple
3003       simultaneous substitutions, as this pcre2test example shows:
3004
3005         /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
3006             apple lemon
3007          2: pear orange
3008
3009       As  well as the usual options for pcre2_match(), a number of additional
3010       options can be set in the options argument of pcre2_substitute().
3011
3012       PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject
3013       string,  replacing every matching substring. If this option is not set,
3014       only the first matching substring is replaced. The search  for  matches
3015       takes  place in the original subject string (that is, previous replace‐
3016       ments do not affect it).  Iteration is  implemented  by  advancing  the
3017       startoffset  value  for  each search, which is always passed the entire
3018       subject string. If an offset limit is set in the match context, search‐
3019       ing stops when that limit is reached.
3020
3021       You  can  restrict  the effect of a global substitution to a portion of
3022       the subject string by setting either or both of startoffset and an off‐
3023       set limit. Here is a pcre2test example:
3024
3025         /B/g,replace=!,use_offset_limit
3026         ABC ABC ABC ABC\=offset=3,offset_limit=12
3027          2: ABC A!C A!C ABC
3028
3029       When  continuing  with  global substitutions after matching a substring
3030       with zero length, an attempt to find a non-empty match at the same off‐
3031       set is performed.  If this is not successful, the offset is advanced by
3032       one character except when CRLF is a valid newline sequence and the next
3033       two  characters are CR, LF. In this case, the offset is advanced by two
3034       characters.
3035
3036       PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when  the  output
3037       buffer is too small. The default action is to return PCRE2_ERROR_NOMEM‐
3038       ORY immediately. If this option  is  set,  however,  pcre2_substitute()
3039       continues to go through the motions of matching and substituting (with‐
3040       out, of course, writing anything) in order to compute the size of  buf‐
3041       fer  that  is  needed.  This  value is passed back via the outlengthptr
3042       variable,   with   the   result   of   the   function    still    being
3043       PCRE2_ERROR_NOMEMORY.
3044
3045       Passing  a  buffer  size  of zero is a permitted way of finding out how
3046       much memory is needed for a given substitution. However, this does mean
3047       that the entire operation is carried out twice. Depending on the appli‐
3048       cation, it may be more efficient to allocate a large  buffer  and  free
3049       the   excess   afterwards,   instead  of  using  PCRE2_SUBSTITUTE_OVER‐
3050       FLOW_LENGTH.
3051
3052       PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that
3053       do not appear in the pattern to be treated as unset groups. This option
3054       should be used with care, because it means that a typo in a group  name
3055       or number no longer causes the PCRE2_ERROR_NOSUBSTRING error.
3056
3057       PCRE2_SUBSTITUTE_UNSET_EMPTY  causes  unset  capture  groups (including
3058       unknown  groups  when  PCRE2_SUBSTITUTE_UNKNOWN_UNSET  is  set)  to  be
3059       treated  as  empty  strings  when  inserted as described above. If this
3060       option is not set, an attempt to  insert  an  unset  group  causes  the
3061       PCRE2_ERROR_UNSET  error.  This  option does not influence the extended
3062       substitution syntax described below.
3063
3064       PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to  the
3065       replacement  string.  Without this option, only the dollar character is
3066       special, and only the group insertion forms  listed  above  are  valid.
3067       When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
3068
3069       Firstly,  backslash in a replacement string is interpreted as an escape
3070       character. The usual forms such as \n or \x{ddd} can be used to specify
3071       particular  character codes, and backslash followed by any non-alphanu‐
3072       meric character quotes that character. Extended quoting  can  be  coded
3073       using \Q...\E, exactly as in pattern strings.
3074
3075       There  are  also four escape sequences for forcing the case of inserted
3076       letters.  The insertion mechanism has three states:  no  case  forcing,
3077       force upper case, and force lower case. The escape sequences change the
3078       current state: \U and \L change to upper or lower case forcing, respec‐
3079       tively,  and  \E (when not terminating a \Q quoted sequence) reverts to
3080       no case forcing. The sequences \u and \l force the next  character  (if
3081       it  is  a  letter)  to  upper or lower case, respectively, and then the
3082       state automatically reverts to no case forcing. Case forcing applies to
3083       all  inserted  characters, including those from capture groups and let‐
3084       ters within \Q...\E quoted sequences.
3085
3086       Note that case forcing sequences such as \U...\E do not nest. For exam‐
3087       ple,  the  result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final
3088       \E  has  no   effect.   Note   also   that   the   PCRE2_ALT_BSUX   and
3089       PCRE2_EXTRA_ALT_BSUX options do not apply to replacement
3090       strings.
3091
3092       The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to  add  more
3093       flexibility  to  capture  group  substitution. The syntax is similar to
3094       that used by Bash:
3095
3096         ${<n>:-<string>}
3097         ${<n>:+<string1>:<string2>}
3098
3099       As before, <n> may be a group number or a name. The first  form  speci‐
3100       fies  a  default  value. If group <n> is set, its value is inserted; if
3101       not, <string> is expanded and the  result  inserted.  The  second  form
3102       specifies  strings that are expanded and inserted when group <n> is set
3103       or unset, respectively. The first form is just a  convenient  shorthand
3104       for
3105
3106         ${<n>:+${<n>}:<string>}
3107
3108       Backslash  can  be  used to escape colons and closing curly brackets in
3109       the replacement strings. A change of the case forcing  state  within  a
3110       replacement  string  remains  in  force  afterwards,  as  shown in this
3111       pcre2test example:
3112
3113         /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
3114             body
3115          1: hello
3116             somebody
3117          1: HELLO
3118
3119       The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these  extended
3120       substitutions.   However,   PCRE2_SUBSTITUTE_UNKNOWN_UNSET  does  cause
3121       unknown groups in the extended syntax forms to be treated as unset.
3122
3123       If successful, pcre2_substitute()  returns  the  number  of  successful
3124       matches.  This  may  be  zero  if  no  matches were found, and is never
3125       greater than 1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
3126
3127       In the event of an error, a negative error code is returned. Except for
3128       PCRE2_ERROR_NOMATCH    (which   is   never   returned),   errors   from
3129       pcre2_match() are passed straight back.
3130
3131       PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring inser‐
3132       tion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
3133
3134       PCRE2_ERROR_UNSET is returned for an unset substring insertion (includ‐
3135       ing an unknown substring when  PCRE2_SUBSTITUTE_UNKNOWN_UNSET  is  set)
3136       when  the  simple  (non-extended)  syntax  is  used  and  PCRE2_SUBSTI‐
3137       TUTE_UNSET_EMPTY is not set.
3138
3139       PCRE2_ERROR_NOMEMORY is returned  if  the  output  buffer  is  not  big
3140       enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size
3141       of buffer that is needed is returned via outlengthptr. Note  that  this
3142       does not happen by default.
3143
3144       PCRE2_ERROR_BADREPLACEMENT  is  used for miscellaneous syntax errors in
3145       the   replacement   string,   with   more   particular   errors   being
3146       PCRE2_ERROR_BADREPESCAPE  (invalid  escape  sequence), PCRE2_ERROR_REP‐
3147       MISSINGBRACE (closing curly bracket not found),  PCRE2_ERROR_BADSUBSTI‐
3148       TUTION   (syntax   error   in   extended   group   substitution),   and
3149       PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before  it  started
3150       or  the match started earlier than the current position in the subject,
3151       which can happen if \K is used in an assertion).
3152
3153       As for all PCRE2 errors, a text message that describes the error can be
3154       obtained   by   calling  the  pcre2_get_error_message()  function  (see
3155       "Obtaining a textual error message" above).
3156
3157   Substitution callouts
3158
3159       int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
3160         int (*callout_function)(pcre2_substitute_callout_block *, void *),
3161         void *callout_data);
3162
3163       The pcre2_set_substitute_callout() function can be used to specify a
3164       callout  function for pcre2_substitute(). This information is passed in
3165       a match context. The callout function is called after each substitution
3166       has been processed, but it can cause the replacement not to happen. The
3167       callout function is not called for simulated substitutions that  happen
3168       as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option.
3169
3170       The first argument of the callout function is a pointer to a substitute
3171       callout block structure, which contains the following fields, not  nec‐
3172       essarily in this order:
3173
3174         uint32_t    version;
3175         uint32_t    subscount;
3176         PCRE2_SPTR  input;
3177         PCRE2_SPTR  output;
3178         PCRE2_SIZE *ovector;
3179         uint32_t    oveccount;
3180         PCRE2_SIZE  output_offsets[2];
3181
3182       The  version field contains the version number of the block format. The
3183       current version is 0. The version number will  increase  in  future  if
3184       more  fields are added, but the intention is never to remove any of the
3185       existing fields.
3186
3187       The subscount field is the number of the current match. It is 1 for the
3188       first callout, 2 for the second, and so on. The input and output point‐
3189       ers are copies of the values passed to pcre2_substitute().
3190
3191       The ovector field points to the ovector, which contains the  result  of
3192       the most recent match. The oveccount field contains the number of pairs
3193       that are set in the ovector, and is always greater than zero.
3194
3195       The output_offsets vector contains the offsets of  the  replacement  in
3196       the  output  string. This has already been processed for dollar and (if
3197       requested) backslash substitutions as described above.
3198
3199       The second argument of the callout function  is  the  value  passed  as
3200       callout_data  when  the  function was registered. The value returned by
3201       the callout function is interpreted as follows:
3202
3203       If the value is zero, the replacement is accepted, and,  if  PCRE2_SUB‐
3204       STITUTE_GLOBAL  is set, processing continues with a search for the next
3205       match. If the value  is  not  zero,  the  current  replacement  is  not
3206       accepted.  If the value is greater than zero, processing continues when
3207       PCRE2_SUBSTITUTE_GLOBAL is set. Otherwise (the value is less than  zero
3208       or  PCRE2_SUBSTITUTE_GLOBAL  is  not  set),  the rest of the input is
3209       copied to the output and the call to pcre2_substitute() exits,  return‐
3210       ing the number of matches so far.
3211

DUPLICATE CAPTURE GROUP NAMES

3213
3214       int pcre2_substring_nametable_scan(const pcre2_code *code,
3215         PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
3216
3217       When  a  pattern  is compiled with the PCRE2_DUPNAMES option, names for
3218       capture groups are not required  to  be  unique.  Duplicate  names  are
3219       always  allowed  for  groups with the same number, created by using the
3220       (?| feature. Indeed, if such groups are named, they are required to use
3221       the same names.
3222
3223       Normally,  patterns  that  use duplicate names are such that in any one
3224       match, only one of each set of identically-named  groups  participates.
3225       An example is shown in the pcre2pattern documentation.
3226
3227       When   duplicates   are   present,   pcre2_substring_copy_byname()  and
3228       pcre2_substring_get_byname() return the first  substring  corresponding
3229       to   the   given   name   that   is  set.  Only  if  none  are  set  is
3230       PCRE2_ERROR_UNSET is returned.  The  pcre2_substring_number_from_name()
3231       function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
3232       duplicate names.
3233
3234       If you want to get full details of all captured substrings for a  given
3235       name,  you  must use the pcre2_substring_nametable_scan() function. The
3236       first argument is the compiled pattern, and the second is the name.  If
3237       the  third  and fourth arguments are NULL, the function returns a group
3238       number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
3239
3240       When the third and fourth arguments are not NULL, they must be pointers
3241       to  variables  that are updated by the function. After it has run, they
3242       point to the first and last entries in the name-to-number table for the
3243       given  name,  and the function returns the length of each entry in code
3244       units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there  are
3245       no entries for the given name.
3246
3247       The format of the name table is described above in the section entitled
3248       Information about a pattern. Given all the  relevant  entries  for  the
3249       name,  you  can  extract  each of their numbers, and hence the captured
3250       data.
3251

FINDING ALL POSSIBLE MATCHES AT ONE POSITION

3253
3254       The traditional matching function uses a  similar  algorithm  to  Perl,
3255       which  stops when it finds the first match at a given point in the sub‐
3256       ject. If you want to find all possible matches, or the longest possible
3257       match  at  a  given  position,  consider using the alternative matching
3258       function (see below) instead. If you cannot use the  alternative  func‐
3259       tion, you can kludge it up by making use of the callout facility, which
3260       is described in the pcre2callout documentation.
3261
3262       What you have to do is to insert a callout right at the end of the pat‐
3263       tern.   When your callout function is called, extract and save the cur‐
3264       rent matched substring. Then return 1, which  forces  pcre2_match()  to
3265       backtrack  and  try other alternatives. Ultimately, when it runs out of
3266       matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
3267

MATCHING A PATTERN: THE ALTERNATIVE FUNCTION

3269
3270       int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
3271         PCRE2_SIZE length, PCRE2_SIZE startoffset,
3272         uint32_t options, pcre2_match_data *match_data,
3273         pcre2_match_context *mcontext,
3274         int *workspace, PCRE2_SIZE wscount);
3275
3276       The function pcre2_dfa_match() is called  to  match  a  subject  string
3277       against  a  compiled pattern, using a matching algorithm that scans the
3278       subject string just once (not counting lookaround assertions), and does
3279       not  backtrack.  This has different characteristics to the normal algo‐
3280       rithm, and is not compatible with Perl. Some of the features  of  PCRE2
3281       patterns  are  not  supported.  Nevertheless, there are times when this
3282       kind of matching can be useful. For a discussion of  the  two  matching
3283       algorithms, and a list of features that pcre2_dfa_match() does not sup‐
3284       port, see the pcre2matching documentation.
3285
3286       The arguments for the pcre2_dfa_match() function are the  same  as  for
3287       pcre2_match(), plus two extras. The ovector within the match data block
3288       is used in a different way, and this is described below. The other com‐
3289       mon  arguments  are used in the same way as for pcre2_match(), so their
3290       description is not repeated here.
3291
3292       The two additional arguments provide workspace for  the  function.  The
3293       workspace  vector  should  contain at least 20 elements. It is used for
3294       keeping  track  of  multiple  paths  through  the  pattern  tree.  More
3295       workspace  is needed for patterns and subjects where there are a lot of
3296       potential matches.
3297
3298       Here is an example of a simple call to pcre2_dfa_match():
3299
3300         int wspace[20];
3301         pcre2_match_data *md = pcre2_match_data_create(4, NULL);
3302         int rc = pcre2_dfa_match(
3303           re,             /* result of pcre2_compile() */
3304           "some string",  /* the subject string */
3305           11,             /* the length of the subject string */
3306           0,              /* start at offset 0 in the subject */
3307           0,              /* default options */
3308           md,             /* the match data block */
3309           NULL,           /* a match context; NULL means use defaults */
3310           wspace,         /* working space vector */
3311           20);            /* number of elements (NOT size in bytes) */
3312
3313   Option bits for pcre2_dfa_match()
3314
3315       The unused bits of the options argument for pcre2_dfa_match()  must  be
3316       zero.   The   only   bits   that   may   be   set  are  PCRE2_ANCHORED,
3317       PCRE2_COPY_MATCHED_SUBJECT,      PCRE2_ENDANCHORED,       PCRE2_NOTBOL,
3318       PCRE2_NOTEOL,          PCRE2_NOTEMPTY,          PCRE2_NOTEMPTY_ATSTART,
3319       PCRE2_NO_UTF_CHECK,       PCRE2_PARTIAL_HARD,       PCRE2_PARTIAL_SOFT,
3320       PCRE2_DFA_SHORTEST,  and  PCRE2_DFA_RESTART.  All  but the last four of
3321       these are exactly the same as for pcre2_match(), so  their  description
3322       is not repeated here.
3323
3324         PCRE2_PARTIAL_HARD
3325         PCRE2_PARTIAL_SOFT
3326
3327       These  have  the  same general effect as they do for pcre2_match(), but
3328       the details are slightly different. When PCRE2_PARTIAL_HARD is set  for
3329       pcre2_dfa_match(),  it  returns  PCRE2_ERROR_PARTIAL  if the end of the
3330       subject is reached and there is still at least one matching possibility
3331       that requires additional characters. This happens even if some complete
3332       matches have already been found. When PCRE2_PARTIAL_SOFT  is  set,  the
3333       return  code  PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
3334       if the end of the subject is  reached,  there  have  been  no  complete
3335       matches, but there is still at least one matching possibility. The por‐
3336       tion of the string that was inspected when the  longest  partial  match
3337       was found is set as the first matching string in both cases. There is a
3338       more detailed discussion of partial and  multi-segment  matching,  with
3339       examples, in the pcre2partial documentation.
3340
3341         PCRE2_DFA_SHORTEST
3342
3343       Setting  the PCRE2_DFA_SHORTEST option causes the matching algorithm to
3344       stop as soon as it has found one match. Because of the way the alterna‐
3345       tive  algorithm  works, this is necessarily the shortest possible match
3346       at the first possible matching point in the subject string.
3347
3348         PCRE2_DFA_RESTART
3349
3350       When pcre2_dfa_match() returns a partial match, it is possible to  call
3351       it again, with additional subject characters, and have it continue with
3352       the same match. The PCRE2_DFA_RESTART option requests this action; when
3353       it is set, the workspace and wscount arguments must reference the same
3354       vector as before because data about the match so far is  left  in  them
3355       after a partial match. There is more discussion of this facility in the
3356       pcre2partial documentation.
3357
3358   Successful returns from pcre2_dfa_match()
3359
3360       When pcre2_dfa_match() succeeds, it may have matched more than one sub‐
3361       string in the subject. Note, however, that all the matches from one run
3362       of the function start at the same point in  the  subject.  The  shorter
3363       matches  are all initial substrings of the longer matches. For example,
3364       if the pattern
3365
3366         <.*>
3367
3368       is matched against the string
3369
3370         This is <something> <something else> <something further> no more
3371
3372       the three matched strings are
3373
3374         <something> <something else> <something further>
3375         <something> <something else>
3376         <something>
3377
3378       On success, the yield of the function is a number  greater  than  zero,
3379       which  is  the  number  of  matched substrings. The offsets of the sub‐
3380       strings are returned in the ovector, and can be extracted by number  in
3381       the  same way as for pcre2_match(), but the numbers bear no relation to
3382       any capture groups that may exist in the pattern, because DFA  matching
3383       does not support capturing.
3384
3385       Calls  to  the  convenience  functions  that extract substrings by name
3386       return the error PCRE2_ERROR_DFA_UFUNC (unsupported function)  if  used
3387       after a DFA match. The convenience functions that extract substrings by
3388       number never return PCRE2_ERROR_NOSUBSTRING.
3389
3390       The matched strings are stored in  the  ovector  in  reverse  order  of
3391       length;  that  is,  the longest matching string is first. If there were
3392       too many matches to fit into the ovector, the yield of the function  is
3393       zero, and the vector is filled with the longest matches.
3394
3395       NOTE:  PCRE2's  "auto-possessification" optimization usually applies to
3396       character repeats at the end of a pattern (as well as internally).  For
3397       example,  the pattern "a\d+" is compiled as if it were "a\d++". For DFA
3398       matching, this means that only one possible  match  is  found.  If  you
3399       really  do  want multiple matches in such cases, either use an ungreedy
3400       repeat such as "a\d+?" or set  the  PCRE2_NO_AUTO_POSSESS  option  when
3401       compiling.
3402
3403   Error returns from pcre2_dfa_match()
3404
3405       The pcre2_dfa_match() function returns a negative number when it fails.
3406       Many of the errors are the same  as  for  pcre2_match(),  as  described
3407       above.  There are in addition the following errors that are specific to
3408       pcre2_dfa_match():
3409
3410         PCRE2_ERROR_DFA_UITEM
3411
3412       This return is given if pcre2_dfa_match() encounters  an  item  in  the
3413       pattern  that it does not support, for instance, the use of \C in a UTF
3414       mode or a backreference.
3415
3416         PCRE2_ERROR_DFA_UCOND
3417
3418       This return is given if pcre2_dfa_match() encounters a  condition  item
3419       that uses a backreference for the condition, or a test for recursion in
3420       a specific capture group. These are not supported.
3421
3422         PCRE2_ERROR_DFA_WSSIZE
3423
3424       This return is given if pcre2_dfa_match() runs  out  of  space  in  the
3425       workspace vector.
3426
3427         PCRE2_ERROR_DFA_RECURSE
3428
3429       When a recursion or subroutine call is processed, the matching function
3430       calls itself recursively, using private  memory  for  the  ovector  and
3431       workspace.   This  error  is given if the internal ovector is not large
3432       enough. This should be extremely rare, as a  vector  of  size  1000  is
3433       used.
3434
3435         PCRE2_ERROR_DFA_BADRESTART
3436
3437       When  pcre2_dfa_match()  is  called  with the PCRE2_DFA_RESTART option,
3438       some plausibility checks are made on the  contents  of  the  workspace,
3439       which  should  contain data about the previous partial match. If any of
3440       these checks fail, this error is given.
3441

SEE ALSO

3443
3444       pcre2build(3),   pcre2callout(3),    pcre2demo(3),    pcre2matching(3),
3445       pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3).
3446

AUTHOR

3448
3449       Philip Hazel
3450       University Computing Service
3451       Cambridge, England.
3452

REVISION

3454
3455       Last updated: 14 February 2019
3456       Copyright (c) 1997-2019 University of Cambridge.
3457
3458
3459
3460PCRE2 10.33                    14 February 2019                    PCRE2API(3)
Impressum