1PCRE2API(3)                Library Functions Manual                PCRE2API(3)
2
3
4

NAME

6       PCRE2 - Perl-compatible regular expressions (revised API)
7
8       #include <pcre2.h>
9
10       PCRE2  is  a  new API for PCRE, starting at release 10.0. This document
11       contains a description of all its native functions. See the pcre2 docu‐
12       ment for an overview of all the PCRE2 documentation.
13

PCRE2 NATIVE API BASIC FUNCTIONS

15
16       pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
17         uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
18         pcre2_compile_context *ccontext);
19
20       void pcre2_code_free(pcre2_code *code);
21
22       pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
23         pcre2_general_context *gcontext);
24
25       pcre2_match_data *pcre2_match_data_create_from_pattern(
26         const pcre2_code *code, pcre2_general_context *gcontext);
27
28       int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
29         PCRE2_SIZE length, PCRE2_SIZE startoffset,
30         uint32_t options, pcre2_match_data *match_data,
31         pcre2_match_context *mcontext);
32
33       int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
34         PCRE2_SIZE length, PCRE2_SIZE startoffset,
35         uint32_t options, pcre2_match_data *match_data,
36         pcre2_match_context *mcontext,
37         int *workspace, PCRE2_SIZE wscount);
38
39       void pcre2_match_data_free(pcre2_match_data *match_data);
40

PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS

42
43       PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
44
45       uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
46
47       PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
48
49       PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
50

PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS

52
53       pcre2_general_context *pcre2_general_context_create(
54         void *(*private_malloc)(PCRE2_SIZE, void *),
55         void (*private_free)(void *, void *), void *memory_data);
56
57       pcre2_general_context *pcre2_general_context_copy(
58         pcre2_general_context *gcontext);
59
60       void pcre2_general_context_free(pcre2_general_context *gcontext);
61

PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS

63
64       pcre2_compile_context *pcre2_compile_context_create(
65         pcre2_general_context *gcontext);
66
67       pcre2_compile_context *pcre2_compile_context_copy(
68         pcre2_compile_context *ccontext);
69
70       void pcre2_compile_context_free(pcre2_compile_context *ccontext);
71
72       int pcre2_set_bsr(pcre2_compile_context *ccontext,
73         uint32_t value);
74
75       int pcre2_set_character_tables(pcre2_compile_context *ccontext,
76         const unsigned char *tables);
77
78       int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext,
79         uint32_t extra_options);
80
81       int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
82         PCRE2_SIZE value);
83
84       int pcre2_set_newline(pcre2_compile_context *ccontext,
85         uint32_t value);
86
87       int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
88         uint32_t value);
89
90       int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
91         int (*guard_function)(uint32_t, void *), void *user_data);
92

PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS

94
95       pcre2_match_context *pcre2_match_context_create(
96         pcre2_general_context *gcontext);
97
98       pcre2_match_context *pcre2_match_context_copy(
99         pcre2_match_context *mcontext);
100
101       void pcre2_match_context_free(pcre2_match_context *mcontext);
102
103       int pcre2_set_callout(pcre2_match_context *mcontext,
104         int (*callout_function)(pcre2_callout_block *, void *),
105         void *callout_data);
106
107       int pcre2_set_offset_limit(pcre2_match_context *mcontext,
108         PCRE2_SIZE value);
109
110       int pcre2_set_heap_limit(pcre2_match_context *mcontext,
111         uint32_t value);
112
113       int pcre2_set_match_limit(pcre2_match_context *mcontext,
114         uint32_t value);
115
116       int pcre2_set_depth_limit(pcre2_match_context *mcontext,
117         uint32_t value);
118

PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS

120
121       int pcre2_substring_copy_byname(pcre2_match_data *match_data,
122         PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
123
124       int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
125         uint32_t number, PCRE2_UCHAR *buffer,
126         PCRE2_SIZE *bufflen);
127
128       void pcre2_substring_free(PCRE2_UCHAR *buffer);
129
130       int pcre2_substring_get_byname(pcre2_match_data *match_data,
131         PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
132
133       int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
134         uint32_t number, PCRE2_UCHAR **bufferptr,
135         PCRE2_SIZE *bufflen);
136
137       int pcre2_substring_length_byname(pcre2_match_data *match_data,
138         PCRE2_SPTR name, PCRE2_SIZE *length);
139
140       int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
141         uint32_t number, PCRE2_SIZE *length);
142
143       int pcre2_substring_nametable_scan(const pcre2_code *code,
144         PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
145
146       int pcre2_substring_number_from_name(const pcre2_code *code,
147         PCRE2_SPTR name);
148
149       void pcre2_substring_list_free(PCRE2_SPTR *list);
150
151       int pcre2_substring_list_get(pcre2_match_data *match_data,
152         PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
153

PCRE2 NATIVE API STRING SUBSTITUTION FUNCTION

155
156       int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
157         PCRE2_SIZE length, PCRE2_SIZE startoffset,
158         uint32_t options, pcre2_match_data *match_data,
159         pcre2_match_context *mcontext, PCRE2_SPTR replacement,
160         PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
161         PCRE2_SIZE *outlengthptr);
162

PCRE2 NATIVE API JIT FUNCTIONS

164
165       int pcre2_jit_compile(pcre2_code *code, uint32_t options);
166
167       int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
168         PCRE2_SIZE length, PCRE2_SIZE startoffset,
169         uint32_t options, pcre2_match_data *match_data,
170         pcre2_match_context *mcontext);
171
172       void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
173
174       pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE startsize,
175         PCRE2_SIZE maxsize, pcre2_general_context *gcontext);
176
177       void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
178         pcre2_jit_callback callback_function, void *callback_data);
179
180       void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
181

PCRE2 NATIVE API SERIALIZATION FUNCTIONS

183
184       int32_t pcre2_serialize_decode(pcre2_code **codes,
185         int32_t number_of_codes, const uint8_t *bytes,
186         pcre2_general_context *gcontext);
187
188       int32_t pcre2_serialize_encode(const pcre2_code **codes,
189         int32_t number_of_codes, uint8_t **serialized_bytes,
190         PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
191
192       void pcre2_serialize_free(uint8_t *bytes);
193
194       int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes);
195

PCRE2 NATIVE API AUXILIARY FUNCTIONS

197
198       pcre2_code *pcre2_code_copy(const pcre2_code *code);
199
200       pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
201
202       int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
203         PCRE2_SIZE bufflen);
204
205       const unsigned char *pcre2_maketables(pcre2_general_context *gcontext);
206
207       int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);
208
209       int pcre2_callout_enumerate(const pcre2_code *code,
210         int (*callback)(pcre2_callout_enumerate_block *, void *),
211         void *user_data);
212
213       int pcre2_config(uint32_t what, void *where);
214

PCRE2 NATIVE API OBSOLETE FUNCTIONS

216
217       int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
218         uint32_t value);
219
220       int pcre2_set_recursion_memory_management(
221         pcre2_match_context *mcontext,
222         void *(*private_malloc)(PCRE2_SIZE, void *),
223         void (*private_free)(void *, void *), void *memory_data);
224
225       These  functions became obsolete at release 10.30 and are retained only
226       for backward compatibility. They should not be used in  new  code.  The
227       first  is  replaced by pcre2_set_depth_limit(); the second is no longer
228       needed and has no effect (it always returns zero).
229

PCRE2 EXPERIMENTAL PATTERN CONVERSION FUNCTIONS

231
232       pcre2_convert_context *pcre2_convert_context_create(
233         pcre2_general_context *gcontext);
234
235       pcre2_convert_context *pcre2_convert_context_copy(
236         pcre2_convert_context *cvcontext);
237
238       void pcre2_convert_context_free(pcre2_convert_context *cvcontext);
239
240       int pcre2_set_glob_escape(pcre2_convert_context *cvcontext,
241         uint32_t escape_char);
242
243       int pcre2_set_glob_separator(pcre2_convert_context *cvcontext,
244         uint32_t separator_char);
245
246       int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length,
247         uint32_t options, PCRE2_UCHAR **buffer,
248         PCRE2_SIZE *blength, pcre2_convert_context *cvcontext);
249
250       void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern);
251
252       These functions provide a way of  converting  non-PCRE2  patterns  into
253       patterns  that  can  be  processed by pcre2_compile(). This facility is
254       experimental and may be changed in future releases. At present, "globs"
255       and  POSIX  basic  and  extended patterns can be converted. Details are
256       given in the pcre2convert documentation.
257

PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES

259
260       There are three PCRE2 libraries, supporting 8-bit, 16-bit,  and  32-bit
261       code  units,  respectively.  However,  there  is  just one header file,
262       pcre2.h.  This contains the function prototypes and  other  definitions
263       for all three libraries. One, two, or all three can be installed simul‐
264       taneously. On Unix-like systems the libraries  are  called  libpcre2-8,
265       libpcre2-16, and libpcre2-32, and they can also co-exist with the orig‐
266       inal PCRE libraries.
267
268       Character strings are passed to and from a PCRE2 library as a  sequence
269       of  unsigned  integers  in  code  units of the appropriate width. Every
270       PCRE2 function comes in three different forms, one  for  each  library,
271       for example:
272
273         pcre2_compile_8()
274         pcre2_compile_16()
275         pcre2_compile_32()
276
277       There are also three different sets of data types:
278
279         PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32
280         PCRE2_SPTR8,  PCRE2_SPTR16,  PCRE2_SPTR32
281
282       The  UCHAR  types define unsigned code units of the appropriate widths.
283       For example, PCRE2_UCHAR16 is usually defined as `uint16_t'.  The  SPTR
284       types  are  constant  pointers  to the equivalent UCHAR types, that is,
285       they are pointers to vectors of unsigned code units.
286
287       Many applications use only one code unit width. For their  convenience,
288       macros are defined whose names are the generic forms such as pcre2_com‐
289       pile() and  PCRE2_SPTR.  These  macros  use  the  value  of  the  macro
290       PCRE2_CODE_UNIT_WIDTH  to generate the appropriate width-specific func‐
291       tion and macro names.  PCRE2_CODE_UNIT_WIDTH is not defined by default.
292       An  application  must  define  it  to  be 8, 16, or 32 before including
293       pcre2.h in order to make use of the generic names.
294
295       Applications that use more than one code unit width can be linked  with
296       more  than  one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to
297       be 0 before including pcre2.h, and then use the  real  function  names.
298       Any  code  that  is to be included in an environment where the value of
299       PCRE2_CODE_UNIT_WIDTH is unknown should  also  use  the  real  function
300       names. (Unfortunately, it is not possible in C code to save and restore
301       the value of a macro.)
302
303       If PCRE2_CODE_UNIT_WIDTH is not defined  before  including  pcre2.h,  a
304       compiler error occurs.
305
306       When  using  multiple  libraries  in an application, you must take care
307       when processing any particular pattern to use  only  functions  from  a
308       single  library.   For example, if you want to run a match using a pat‐
309       tern that was compiled with pcre2_compile_16(), you  must  do  so  with
310       pcre2_match_16(), not pcre2_match_8() or pcre2_match_32().
311
312       In  the  function summaries above, and in the rest of this document and
313       other PCRE2 documents, functions and data  types  are  described  using
314       their generic names, without the _8, _16, or _32 suffix.
315

PCRE2 API OVERVIEW

317
318       PCRE2  has  its  own  native  API, which is described in this document.
319       There are also some wrapper functions for the 8-bit library that corre‐
320       spond  to the POSIX regular expression API, but they do not give access
321       to all the functionality of PCRE2. They are described in the pcre2posix
322       documentation. Both these APIs define a set of C function calls.
323
324       The  native  API  C data types, function prototypes, option values, and
325       error codes are defined in the header file pcre2.h, which also contains
326       definitions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release
327       numbers for the library. Applications can use these to include  support
328       for different releases of PCRE2.
329
330       In a Windows environment, if you want to statically link an application
331       program against a non-dll PCRE2 library, you must  define  PCRE2_STATIC
332       before including pcre2.h.
333
334       The  functions pcre2_compile() and pcre2_match() are used for compiling
335       and matching regular expressions in a Perl-compatible manner. A  sample
336       program that demonstrates the simplest way of using them is provided in
337       the file called pcre2demo.c in the PCRE2 source distribution. A listing
338       of  this  program  is  given  in  the  pcre2demo documentation, and the
339       pcre2sample documentation describes how to compile and run it.
340
341       The compiling and matching functions recognize various options that are
342       passed as bits in an options argument. There are also some more compli‐
343       cated  parameters  such  as  custom  memory  management  functions  and
344       resource  limits  that  are passed in "contexts" (which are just memory
345       blocks, described below). Simple applications do not need to  make  use
346       of contexts.
347
348       Just-in-time  (JIT)  compiler  support  is an optional feature of PCRE2
349       that can be built in  appropriate  hardware  environments.  It  greatly
350       speeds  up  the  matching  performance  of  many patterns. Programs can
351       request that it be used if  available  by  calling  pcre2_jit_compile()
352       after a pattern has been successfully compiled by pcre2_compile(). This
353       does nothing if JIT support is not available.
354
355       More complicated programs might need to  make  use  of  the  specialist
356       functions    pcre2_jit_stack_create(),    pcre2_jit_stack_free(),   and
357       pcre2_jit_stack_assign() in order to  control  the  JIT  code's  memory
358       usage.
359
360       JIT matching is automatically used by pcre2_match() if it is available,
361       unless the PCRE2_NO_JIT option is set. There is also a direct interface
362       for  JIT  matching,  which gives improved performance at the expense of
363       less sanity checking. The JIT-specific functions are discussed  in  the
364       pcre2jit documentation.
365
366       A  second  matching function, pcre2_dfa_match(), which is not Perl-com‐
367       patible, is also provided. This uses  a  different  algorithm  for  the
368       matching.  The  alternative  algorithm finds all possible matches (at a
369       given point in the subject), and scans the subject  just  once  (unless
370       there  are  lookaround  assertions).  However,  this algorithm does not
371       return captured substrings. A description of  the  two  matching  algo‐
372       rithms   and  their  advantages  and  disadvantages  is  given  in  the
373       pcre2matching   documentation.   There   is   no   JIT   support    for
374       pcre2_dfa_match().
375
376       In  addition  to  the  main compiling and matching functions, there are
377       convenience functions for extracting captured substrings from a subject
378       string that has been matched by pcre2_match(). They are:
379
380         pcre2_substring_copy_byname()
381         pcre2_substring_copy_bynumber()
382         pcre2_substring_get_byname()
383         pcre2_substring_get_bynumber()
384         pcre2_substring_list_get()
385         pcre2_substring_length_byname()
386         pcre2_substring_length_bynumber()
387         pcre2_substring_nametable_scan()
388         pcre2_substring_number_from_name()
389
390       pcre2_substring_free()  and  pcre2_substring_list_free()  are also pro‐
391       vided, to free memory used for extracted strings. If  either  of  these
392       functions  is called with a NULL argument, the function returns immedi‐
393       ately without doing anything.
394
395       The function pcre2_substitute() can be called to match  a  pattern  and
396       return  a  copy of the subject string with substitutions for parts that
397       were matched.
398
399       Functions whose names begin with pcre2_serialize_ are used  for  saving
400       compiled patterns on disc or elsewhere, and reloading them later.
401
402       Finally,  there  are functions for finding out information about a com‐
403       piled pattern (pcre2_pattern_info()) and about the  configuration  with
404       which PCRE2 was built (pcre2_config()).
405
406       Functions  with  names  ending with _free() are used for freeing memory
407       blocks of various sorts. In all cases, if one  of  these  functions  is
408       called with a NULL argument, it does nothing.
409

STRING LENGTHS AND OFFSETS

411
412       The  PCRE2  API  uses  string  lengths and offsets into strings of code
413       units in several places. These values are always  of  type  PCRE2_SIZE,
414       which  is an unsigned integer type, currently always defined as size_t.
415       The largest  value  that  can  be  stored  in  such  a  type  (that  is
416       ~(PCRE2_SIZE)0)  is reserved as a special indicator for zero-terminated
417       strings and unset offsets.  Therefore, the longest string that  can  be
418       handled is one less than this maximum.
419

NEWLINES

421
422       PCRE2 supports five different conventions for indicating line breaks in
423       strings: a single CR (carriage return) character, a  single  LF  (line‐
424       feed) character, the two-character sequence CRLF, any of the three pre‐
425       ceding, or any Unicode newline sequence. The Unicode newline  sequences
426       are  the  three just mentioned, plus the single characters VT (vertical
427       tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line
428       separator, U+2028), and PS (paragraph separator, U+2029).
429
430       Each  of  the first three conventions is used by at least one operating
431       system as its standard newline sequence. When PCRE2 is built, a default
432       can be specified.  If it is not, the default is set to LF, which is the
433       Unix standard. However, the newline convention can  be  changed  by  an
434       application  when  calling  pcre2_compile(),  or it can be specified by
435       special text at the start of the pattern  itself;  this  overrides  any
436       other  settings.  See  the pcre2pattern page for details of the special
437       character sequences.
438
439       In the PCRE2 documentation the word "newline"  is  used  to  mean  "the
440       character or pair of characters that indicate a line break". The choice
441       of newline convention affects the handling of the dot, circumflex,  and
442       dollar metacharacters, the handling of #-comments in /x mode, and, when
443       CRLF is a recognized line ending sequence, the match position  advance‐
444       ment for a non-anchored pattern. There is more detail about this in the
445       section on pcre2_match() options below.
446
447       The choice of newline convention does not affect the interpretation  of
448       the \n or \r escape sequences, nor does it affect what \R matches; this
449       has its own separate convention.
450

MULTITHREADING

452
453       In a multithreaded application it is important to keep  thread-specific
454       data  separate  from data that can be shared between threads. The PCRE2
455       library code itself is thread-safe: it contains  no  static  or  global
456       variables.  The  API  is  designed to be fairly simple for non-threaded
457       applications while at the same time ensuring that multithreaded  appli‐
458       cations can use it.
459
460       There are several different blocks of data that are used to pass infor‐
461       mation between the application and the PCRE2 libraries.
462
463   The compiled pattern
464
465       A pointer to the compiled form of a pattern is  returned  to  the  user
466       when pcre2_compile() is successful. The data in the compiled pattern is
467       fixed, and does not change when the pattern is matched.  Therefore,  it
468       is  thread-safe, that is, the same compiled pattern can be used by more
469       than one thread simultaneously. For example, an application can compile
470       all its patterns at the start, before forking off multiple threads that
471       use them. However, if the just-in-time (JIT)  optimization  feature  is
472       being  used,  it needs separate memory stack areas for each thread. See
473       the pcre2jit documentation for more details.
474
475       In a more complicated situation, where patterns are compiled only  when
476       they  are  first needed, but are still shared between threads, pointers
477       to compiled patterns must be protected  from  simultaneous  writing  by
478       multiple threads, at least until a pattern has been compiled. The logic
479       can be something like this:
480
481         Get a read-only (shared) lock (mutex) for pointer
482         if (pointer == NULL)
483           {
484           Get a write (unique) lock for pointer
485           if (pointer == NULL) pointer = pcre2_compile(...
486           }
487         Release the lock
488         Use pointer in pcre2_match()
489
490       Of course, testing for compilation errors should also  be  included  in
491       the code.
492
493       If JIT is being used, but the JIT compilation is not being done immedi‐
494       ately, (perhaps waiting to see if the pattern  is  used  often  enough)
495       similar logic is required. JIT compilation updates a pointer within the
496       compiled code block, so a thread must gain unique write access  to  the
497       pointer     before    calling    pcre2_jit_compile().    Alternatively,
498       pcre2_code_copy()  or  pcre2_code_copy_with_tables()  can  be  used  to
499       obtain  a private copy of the compiled code before calling the JIT com‐
500       piler.
501
502   Context blocks
503
504       The next main section below introduces the idea of "contexts" in  which
505       PCRE2 functions are called. A context is nothing more than a collection
506       of parameters that control the way PCRE2 operates. Grouping a number of
507       parameters together in a context is a convenient way of passing them to
508       a PCRE2 function without using lots of arguments. The  parameters  that
509       are  stored  in  contexts  are in some sense "advanced features" of the
510       API. Many straightforward applications will not need to use contexts.
511
512       In a multithreaded application, if the parameters in a context are val‐
513       ues  that  are  never  changed, the same context can be used by all the
514       threads. However, if any thread needs to change any value in a context,
515       it must make its own thread-specific copy.
516
517   Match blocks
518
519       The  matching  functions need a block of memory for storing the results
520       of a match. This includes details of what was matched, as well as addi‐
521       tional  information  such as the name of a (*MARK) setting. Each thread
522       must provide its own copy of this memory.
523

PCRE2 CONTEXTS

525
526       Some PCRE2 functions have a lot of parameters, many of which  are  used
527       only  by  specialist  applications,  for example, those that use custom
528       memory management or non-standard character tables.  To  keep  function
529       argument  lists  at a reasonable size, and at the same time to keep the
530       API extensible, "uncommon" parameters are passed to  certain  functions
531       in  a  context instead of directly. A context is just a block of memory
532       that holds the parameter values.  Applications  that  do  not  need  to
533       adjust  any  of  the  context  parameters  can pass NULL when a context
534       pointer is required.
535
536       There are three different types of context: a general context  that  is
537       relevant  for  several  PCRE2 operations, a compile-time context, and a
538       match-time context.
539
540   The general context
541
542       At present, this context just  contains  pointers  to  (and  data  for)
543       external  memory  management  functions  that  are  called from several
544       places in the PCRE2 library. The context is named `general' rather than
545       specifically  `memory'  because in future other fields may be added. If
546       you do not want to supply your own custom memory management  functions,
547       you  do not need to bother with a general context. A general context is
548       created by:
549
550       pcre2_general_context *pcre2_general_context_create(
551         void *(*private_malloc)(PCRE2_SIZE, void *),
552         void (*private_free)(void *, void *), void *memory_data);
553
554       The two function pointers specify custom memory  management  functions,
555       whose prototypes are:
556
557         void *private_malloc(PCRE2_SIZE, void *);
558         void  private_free(void *, void *);
559
560       Whenever code in PCRE2 calls these functions, the final argument is the
561       value of memory_data. Either of the first two arguments of the creation
562       function  may be NULL, in which case the system memory management func‐
563       tions malloc() and free() are used. (This is not currently  useful,  as
564       there  are  no  other  fields in a general context, but in future there
565       might be.)  The private_malloc() function  is  used  (if  supplied)  to
566       obtain  memory  for storing the context, and all three values are saved
567       as part of the context.
568
569       Whenever PCRE2 creates a data block of any kind, the block  contains  a
570       pointer  to the free() function that matches the malloc() function that
571       was used. When the time comes to  free  the  block,  this  function  is
572       called.
573
574       A general context can be copied by calling:
575
576       pcre2_general_context *pcre2_general_context_copy(
577         pcre2_general_context *gcontext);
578
579       The memory used for a general context should be freed by calling:
580
581       void pcre2_general_context_free(pcre2_general_context *gcontext);
582
583       If  this  function  is  passed  a NULL argument, it returns immediately
584       without doing anything.
585
586   The compile context
587
588       A compile context is required if you want to provide an external  func‐
589       tion  for  stack  checking  during compilation or to change the default
590       values of any of the following compile-time parameters:
591
592         What \R matches (Unicode newlines or CR, LF, CRLF only)
593         PCRE2's character tables
594         The newline character sequence
595         The compile time nested parentheses limit
596         The maximum length of the pattern string
597         The extra options bits (none set by default)
598
599       A compile context is also required if you are using custom memory  man‐
600       agement.   If  none of these apply, just pass NULL as the context argu‐
601       ment of pcre2_compile().
602
603       A compile context is created, copied, and freed by the following  func‐
604       tions:
605
606       pcre2_compile_context *pcre2_compile_context_create(
607         pcre2_general_context *gcontext);
608
609       pcre2_compile_context *pcre2_compile_context_copy(
610         pcre2_compile_context *ccontext);
611
612       void pcre2_compile_context_free(pcre2_compile_context *ccontext);
613
614       A  compile  context  is created with default values for its parameters.
615       These can be changed by calling the following functions, which return 0
616       on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
617
618       int pcre2_set_bsr(pcre2_compile_context *ccontext,
619         uint32_t value);
620
621       The  value  must  be PCRE2_BSR_ANYCRLF, to specify that \R matches only
622       CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R  matches  any
623       Unicode line ending sequence. The value is used by the JIT compiler and
624       by  the  two  interpreted   matching   functions,   pcre2_match()   and
625       pcre2_dfa_match().
626
627       int pcre2_set_character_tables(pcre2_compile_context *ccontext,
628         const unsigned char *tables);
629
630       The  value  must  be  the result of a call to pcre2_maketables(), whose
631       only argument is a general context. This function builds a set of char‐
632       acter tables in the current locale.
633
634       int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext,
635         uint32_t extra_options);
636
637       As  PCRE2  has developed, almost all the 32 option bits that are avail‐
638       able in the options argument of pcre2_compile() have been used  up.  To
639       avoid  running  out, the compile context contains a set of extra option
640       bits which are used for some newer, assumed rarer, options. This  func‐
641       tion  sets  those bits. It always sets all the bits (either on or off).
642       It replaces any existing setting outright.  The available options are
643       defined in the section entitled "Extra compile options" below.
644
645       int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
646         PCRE2_SIZE value);
647
648       This  sets a maximum length, in code units, for any pattern string that
649       is compiled with this context. If the pattern is longer,  an  error  is
650       generated.   This facility is provided so that applications that accept
651       patterns from external sources can limit their size. The default is the
652       largest  number  that  a  PCRE2_SIZE variable can hold, which is effec‐
653       tively unlimited.
654
655       int pcre2_set_newline(pcre2_compile_context *ccontext,
656         uint32_t value);
657
658       This specifies which characters or character sequences are to be recog‐
659       nized  as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage
660       return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the
661       two-character  sequence  CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any
662       of the above), PCRE2_NEWLINE_ANY (any  Unicode  newline  sequence),  or
663       PCRE2_NEWLINE_NUL (the NUL character, that is a binary zero).
664
665       A pattern can override the value set in the compile context by starting
666       with a sequence such as (*CRLF). See the pcre2pattern page for details.
667
668       When   a   pattern   is   compiled   with   the    PCRE2_EXTENDED    or
669       PCRE2_EXTENDED_MORE option, the newline convention affects the recogni‐
670       tion of the end of internal comments starting  with  #.  The  value  is
671       saved  with the compiled pattern for subsequent use by the JIT compiler
672       and by  the  two  interpreted  matching  functions,  pcre2_match()  and
673       pcre2_dfa_match().
674
675       int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
676         uint32_t value);
677
678       This parameter adjusts the limit, set when PCRE2 is built (default 250),
679       on the depth of parenthesis nesting in  a  pattern.  This  limit  stops
680       rogue  patterns using up too much system stack when being compiled. The
681       limit applies to parentheses of all kinds, not just capturing parenthe‐
682       ses.
683
684       int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
685         int (*guard_function)(uint32_t, void *), void *user_data);
686
687       There  is at least one application that runs PCRE2 in threads with very
688       limited system stack, where running out of stack is to  be  avoided  at
689       all  costs. The parenthesis limit above cannot take account of how much
690       stack is actually available during compilation. For  a  finer  control,
691       you  can  supply  a  function  that  is called whenever pcre2_compile()
692       starts to compile a parenthesized part of a pattern. This function  can
693       check  the  actual  stack  size  (or anything else that it wants to, of
694       course).
695
696       The  first  argument  to the guard function gives the current  depth  of
697       nesting,  and  the second is user data that is set up by the last argu‐
698       ment  of  pcre2_set_compile_recursion_guard().  The   guard   function
699       should return zero if all is well, or non-zero to force an error.
700
701   The match context
702
703       A match context is required if you want to:
704
705         Set up a callout function
706         Set an offset limit for matching an unanchored pattern
707         Change the limit on the amount of heap used when matching
708         Change the backtracking match limit
709         Change the backtracking depth limit
710         Set custom memory management specifically for the match
711
712       If  none  of  these  apply,  just  pass NULL as the context argument of
713       pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match().
714
715       A match context is created, copied, and freed by  the  following  func‐
716       tions:
717
718       pcre2_match_context *pcre2_match_context_create(
719         pcre2_general_context *gcontext);
720
721       pcre2_match_context *pcre2_match_context_copy(
722         pcre2_match_context *mcontext);
723
724       void pcre2_match_context_free(pcre2_match_context *mcontext);
725
726       A  match  context  is  created  with default values for its parameters.
727       These can be changed by calling the following functions, which return 0
728       on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
729
730       int pcre2_set_callout(pcre2_match_context *mcontext,
731         int (*callout_function)(pcre2_callout_block *, void *),
732         void *callout_data);
733
734       This sets up a "callout" function for PCRE2 to call at specified points
735       during a matching operation. Details are given in the pcre2callout doc‐
736       umentation.
737
738       int pcre2_set_offset_limit(pcre2_match_context *mcontext,
739         PCRE2_SIZE value);
740
741       The  offset_limit  parameter  limits  how  far an unanchored search can
742       advance in the subject string. The default value  is  PCRE2_UNSET.  The
743       pcre2_match()      and      pcre2_dfa_match()      functions     return
744       PCRE2_ERROR_NOMATCH if a match with a starting point before or  at  the
745       given  offset  is  not  found. The pcre2_substitute() function makes no
746       more substitutions.
747
748       For example, if the pattern /abc/ is matched against "123abc"  with  an
749       offset  limit  less than 3, the result is PCRE2_ERROR_NOMATCH. A  match
750       can never be  found  if  the  startoffset  argument  of  pcre2_match(),
751       pcre2_dfa_match(),  or  pcre2_substitute()  is  greater than the offset
752       limit set in the match context.
753
754       When using this  facility,  you  must  set  the  PCRE2_USE_OFFSET_LIMIT
755       option when calling pcre2_compile() so that when JIT is in use, differ‐
756       ent code can be compiled. If a match  is  started  with  a  non-default
757       offset limit when PCRE2_USE_OFFSET_LIMIT is not set, an error is gener‐
758       ated.
759
760       The offset limit facility can be used to track progress when  searching
761       large  subject  strings or to limit the extent of global substitutions.
762       See also the PCRE2_FIRSTLINE option, which requires a  match  to  start
763       before  or  at  the first newline that follows the start of matching in
764       the subject. If this is set with an offset limit, a match must occur in
765       the first line and also within the offset limit. In other words, which‐
766       ever limit comes first is used.
767
768       int pcre2_set_heap_limit(pcre2_match_context *mcontext,
769         uint32_t value);
770
771       The heap_limit parameter specifies, in units of kibibytes (1024 bytes),
772       the  maximum  amount  of heap memory that pcre2_match() may use to hold
773       backtracking information when running an interpretive match. This limit
774       also applies to pcre2_dfa_match(), which may use the heap when process‐
775       ing patterns with a lot of nested pattern recursion or  lookarounds  or
776       atomic groups. This limit does not apply to matching with the JIT opti‐
777       mization, which has  its  own  memory  control  arrangements  (see  the
778       pcre2jit  documentation for more details). If the limit is reached, the
779       negative error code  PCRE2_ERROR_HEAPLIMIT  is  returned.  The  default
780       limit  can be set when PCRE2 is built; if it is not, the default is set
781       very large and is essentially "unlimited".
782
783       A value for the heap limit may also be supplied by an item at the start
784       of a pattern of the form
785
786         (*LIMIT_HEAP=ddd)
787
788       where  ddd  is  a  decimal  number.  However, such a setting is ignored
789       unless ddd is less than the limit set by the  caller  of  pcre2_match()
790       or, if no such limit is set, less than the default.
791
792       The  pcre2_match() function starts out using a 20KiB vector on the sys‐
793       tem stack for recording backtracking points. The more nested backtrack‐
794       ing  points  there  are (that is, the deeper the search tree), the more
795       memory is needed.  Heap memory is used only if the  initial  vector  is
796       too small. If the heap limit is set to a value less than 21 (in partic‐
797       ular, zero) no heap memory will be used. In this  case,  only  patterns
798       that  do not have a lot of nested backtracking can be successfully pro‐
799       cessed.
800
801       Similarly, for pcre2_dfa_match(), a vector on the system stack is  used
802       when  processing pattern recursions, lookarounds, or atomic groups, and
803       only if this is not big enough is heap memory used. In this case,  too,
804       setting a value of zero disables the use of the heap.
805
806       int pcre2_set_match_limit(pcre2_match_context *mcontext,
807         uint32_t value);
808
809       The  match_limit  parameter  provides  a means of preventing PCRE2 from
810       using up too many computing resources when processing patterns that are
811       not going to match, but which have a very large number of possibilities
812       in their search trees. The classic  example  is  a  pattern  that  uses
813       nested unlimited repeats.
814
815       There  is an internal counter in pcre2_match() that is incremented each
816       time round its main matching loop. If  this  value  reaches  the  match
817       limit, pcre2_match() returns the negative value PCRE2_ERROR_MATCHLIMIT.
818       This has the effect of limiting the amount  of  backtracking  that  can
819       take place. For patterns that are not anchored, the count restarts from
820       zero for each position in the subject string. This limit  also  applies
821       to pcre2_dfa_match(), though the counting is done in a different way.
822
823       When  pcre2_match() is called with a pattern that was successfully pro‐
824       cessed by pcre2_jit_compile(), the way in which matching is executed is
825       entirely  different. However, there is still the possibility of runaway
826       matching that goes on for a very long  time,  and  so  the  match_limit
827       value  is  also used in this case (but in a different way) to limit how
828       long the matching can continue.
829
830       The default value for the limit can be set when  PCRE2  is  built;  the
831       built-in default is 10 million, which handles all but the most  extreme
832       cases. A value for the match limit may also be supplied by an  item  at
833       the start of a pattern of the form
834
835         (*LIMIT_MATCH=ddd)
836
837       where  ddd  is  a  decimal  number.  However, such a setting is ignored
838       unless ddd is less than the limit set by the caller of pcre2_match() or
839       pcre2_dfa_match() or, if no such limit is set, less than the default.
840
841       int pcre2_set_depth_limit(pcre2_match_context *mcontext,
842         uint32_t value);
843
844       This   parameter   limits   the   depth   of   nested  backtracking  in
845       pcre2_match().  Each time a nested backtracking point is passed, a  new
846       memory "frame" is used to remember the state of matching at that point.
847       Thus, this parameter indirectly limits the amount  of  memory  that  is
848       used  in  a  match.  However,  because  the size of each memory "frame"
849       depends on the number of capturing parentheses, the actual memory limit
850       varies  from pattern to pattern. This limit was more useful in versions
851       before 10.30, where function recursion was used for backtracking.
852
853       The depth limit is not relevant, and is ignored, when matching is  done
854       using JIT compiled code. However, it is supported by pcre2_dfa_match(),
855       which uses it to limit the depth of nested internal recursive  function
856       calls  that implement atomic groups, lookaround assertions, and pattern
857       recursions. This limits, indirectly, the amount of system stack that is
858       used.  It  was  more useful in versions before 10.32, when stack memory
859       was used for local workspace vectors for recursive function calls. From
860       version  10.32,  only local variables are allocated on the stack and as
861       each call uses only a few hundred bytes, even a small stack can support
862       quite a lot of recursion.
863
864       If  the  depth  of  internal  recursive function calls is great enough,
865       local workspace vectors are allocated on the heap  from  version  10.32
866       onwards,  so  the depth limit also indirectly limits the amount of heap
867       memory that is used. A recursive pattern such as /(.(?2))((?1)|)/, when
868       matched  to a very long string using pcre2_dfa_match(), can use a great
869       deal of memory. However, it is probably  better  to  limit  heap  usage
870       directly by calling pcre2_set_heap_limit().
871
872       The  default  value for the depth limit can be set when PCRE2 is built;
873       if it is not, the default is set to the same value as the  default  for
874       the   match   limit.   If  the  limit  is  exceeded,  pcre2_match()  or
875       pcre2_dfa_match() returns PCRE2_ERROR_DEPTHLIMIT. A value for the depth
876       limit  may also be supplied by an item at the start of a pattern of the
877       form
878
879         (*LIMIT_DEPTH=ddd)
880
881       where ddd is a decimal number.  However,  such  a  setting  is  ignored
882       unless ddd is less than the limit set by the caller of pcre2_match() or
883       pcre2_dfa_match() or, if no such limit is set, less than the default.
884

CHECKING BUILD-TIME OPTIONS

886
887       int pcre2_config(uint32_t what, void *where);
888
889       The function pcre2_config() makes it possible for  a  PCRE2  client  to
890       discover  which  optional  features  have  been compiled into the PCRE2
891       library. The pcre2build documentation  has  more  details  about  these
892       optional features.
893
894       The  first  argument  for pcre2_config() specifies which information is
895       required. The second argument is a pointer to  memory  into  which  the
896       information  is  placed.  If  NULL  is passed, the function returns the
897       amount of memory that is needed  for  the  requested  information.  For
898       calls  that  return  numerical  values,  the  value  is  in bytes; when
899       requesting these values, where should point  to  appropriately  aligned
900       memory.  For calls that return strings, the required length is given in
901       code units, not counting the terminating zero.
902
903       When requesting information, the returned value from pcre2_config()  is
904       non-negative  on success, or the negative error code PCRE2_ERROR_BADOP‐
905       TION if the value in the first argument is not recognized. The  follow‐
906       ing information is available:
907
908         PCRE2_CONFIG_BSR
909
910       The  output  is a uint32_t integer whose value indicates what character
911       sequences the \R  escape  sequence  matches  by  default.  A  value  of
912       PCRE2_BSR_UNICODE  means  that  \R  matches  any  Unicode  line  ending
913       sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches  only  CR,
914       LF, or CRLF. The default can be overridden when a pattern is compiled.
915
916         PCRE2_CONFIG_COMPILED_WIDTHS
917
918       The  output  is a uint32_t integer whose lower bits indicate which code
919       unit widths were selected when PCRE2 was  built.  The  1-bit  indicates
920       8-bit  support, and the 2-bit and 4-bit indicate 16-bit and 32-bit sup‐
921       port, respectively.
922
923         PCRE2_CONFIG_DEPTHLIMIT
924
925       The output is a uint32_t integer that gives the default limit  for  the
926       depth  of  nested  backtracking in pcre2_match() or the depth of nested
927       recursions, lookarounds, and atomic groups in  pcre2_dfa_match().  Fur‐
928       ther details are given with pcre2_set_depth_limit() above.
929
930         PCRE2_CONFIG_HEAPLIMIT
931
932       The  output is a uint32_t integer that gives, in kibibytes, the default
933       limit  for  the  amount  of  heap  memory  used  by  pcre2_match()   or
934       pcre2_dfa_match().      Further      details     are     given     with
935       pcre2_set_heap_limit() above.
936
937         PCRE2_CONFIG_JIT
938
939       The output is a uint32_t integer that is set  to  one  if  support  for
940       just-in-time compiling is available; otherwise it is set to zero.
941
942         PCRE2_CONFIG_JITTARGET
943
944       The  where  argument  should point to a buffer that is at least 48 code
945       units long.  (The  exact  length  required  can  be  found  by  calling
946       pcre2_config()  with  where  set  to NULL.) The buffer is filled with a
947       string that contains the name of the architecture  for  which  the  JIT
948       compiler  is  configured,  for  example  "x86  32bit  (little  endian +
949       unaligned)". If JIT support is not available, PCRE2_ERROR_BADOPTION  is
950       returned,  otherwise the number of code units used is returned. This is
951       the length of the string, plus one unit for the terminating zero.
952
953         PCRE2_CONFIG_LINKSIZE
954
955       The output is a uint32_t integer that contains the number of bytes used
956       for  internal  linkage  in  compiled regular expressions. When PCRE2 is
957       configured, the value can be set to 2, 3, or 4, with the default  being
958       2.  This is the value that is returned by pcre2_config(). However, when
959       the 16-bit library is compiled, a value of 3 is rounded up  to  4,  and
960       when  the  32-bit  library  is compiled, internal linkages always use 4
961       bytes, so the configured value is not relevant.
962
963       The default value of 2 for the 8-bit and 16-bit libraries is sufficient
964       for  all but the most massive patterns, since it allows the size of the
965       compiled pattern to be up to 65535  code  units.  Larger  values  allow
966       larger  regular  expressions to be compiled by those two libraries, but
967       at the expense of slower matching.
968
969         PCRE2_CONFIG_MATCHLIMIT
970
971       The output is a uint32_t integer that gives the default match limit for
972       pcre2_match().  Further  details are given with pcre2_set_match_limit()
973       above.
974
975         PCRE2_CONFIG_NEWLINE
976
977       The output is a uint32_t integer  whose  value  specifies  the  default
978       character  sequence that is recognized as meaning "newline". The values
979       are:
980
981         PCRE2_NEWLINE_CR       Carriage return (CR)
982         PCRE2_NEWLINE_LF       Linefeed (LF)
983         PCRE2_NEWLINE_CRLF     Carriage return, linefeed (CRLF)
984         PCRE2_NEWLINE_ANY      Any Unicode line ending
985         PCRE2_NEWLINE_ANYCRLF  Any of CR, LF, or CRLF
986         PCRE2_NEWLINE_NUL      The NUL character (binary zero)
987
988       The default should normally correspond to  the  standard  sequence  for
989       your operating system.
990
991         PCRE2_CONFIG_NEVER_BACKSLASH_C
992
993       The  output  is  a uint32_t integer that is set to one if the use of \C
994       was permanently disabled when PCRE2 was built; otherwise it is  set  to
995       zero.
996
997         PCRE2_CONFIG_PARENSLIMIT
998
999       The  output is a uint32_t integer that gives the maximum depth of nest‐
1000       ing of parentheses (of any kind) in a pattern. This limit is imposed to
1001       cap  the  amount of system stack used when a pattern is compiled. It is
1002       specified when PCRE2 is built; the default is 250. This limit does  not
1003       take  into  account  the  stack that may already be used by the calling
1004       application. For  finer  control  over  compilation  stack  usage,  see
1005       pcre2_set_compile_recursion_guard().
1006
1007         PCRE2_CONFIG_STACKRECURSE
1008
1009       This parameter is obsolete and should not be used in new code. The out‐
1010       put is a uint32_t integer that is always set to zero.
1011
1012         PCRE2_CONFIG_UNICODE_VERSION
1013
1014       The where argument should point to a buffer that is at  least  24  code
1015       units  long.  (The  exact  length  required  can  be  found  by calling
1016       pcre2_config() with where set to NULL.)  If  PCRE2  has  been  compiled
1017       without  Unicode  support,  the buffer is filled with the text "Unicode
1018       not supported". Otherwise, the Unicode  version  string  (for  example,
1019       "8.0.0")  is  inserted. The number of code units used is returned. This
1020       is the length of the string plus one unit for the terminating zero.
1021
1022         PCRE2_CONFIG_UNICODE
1023
1024       The output is a uint32_t integer that is set to one if Unicode  support
1025       is  available; otherwise it is set to zero. Unicode support implies UTF
1026       support.
1027
1028         PCRE2_CONFIG_VERSION
1029
1030       The where argument should point to a buffer that is at  least  24  code
1031       units  long.  (The  exact  length  required  can  be  found  by calling
1032       pcre2_config() with where set to NULL.) The buffer is filled  with  the
1033       PCRE2 version string, zero-terminated. The number of code units used is
1034       returned. This is the length of the string plus one unit for the termi‐
1035       nating zero.
1036

COMPILING A PATTERN

1038
1039       pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
1040         uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
1041         pcre2_compile_context *ccontext);
1042
1043       void pcre2_code_free(pcre2_code *code);
1044
1045       pcre2_code *pcre2_code_copy(const pcre2_code *code);
1046
1047       pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
1048
1049       The  pcre2_compile() function compiles a pattern into an internal form.
1050       The pattern is defined by a pointer to a string of  code  units  and  a
1051       length  (in  code units). If the pattern is zero-terminated, the length
1052       can be specified  as  PCRE2_ZERO_TERMINATED.  The  function  returns  a
1053       pointer  to  a  block  of memory that contains the compiled pattern and
1054       related data, or NULL if an error occurred.
1055
1056       If the compile context argument ccontext is NULL, memory for  the  com‐
1057       piled  pattern  is  obtained  by  calling  malloc().  Otherwise,  it is
1058       obtained from the same memory function that was used  for  the  compile
1059       context.  The  caller must free the memory by calling pcre2_code_free()
1060       when it is no longer needed.  If pcre2_code_free()  is  called  with  a
1061       NULL argument, it returns immediately, without doing anything.
1062
1063       The function pcre2_code_copy() makes a copy of the compiled code in new
1064       memory, using the same memory allocator as was used for  the  original.
1065       However,  if  the  code  has  been  processed  by the JIT compiler (see
1066       below), the JIT information cannot be copied (because it  is  position-
1067       dependent).  The new copy can initially be used only for non-JIT match‐
1068       ing, though it can be passed to  pcre2_jit_compile()  if  required.  If
1069       pcre2_code_copy() is called with a NULL argument, it returns NULL.
1070
1071       The pcre2_code_copy() function provides a way for individual threads in
1072       a multithreaded application to acquire a private copy  of  shared  com‐
1073       piled  code.   However, it does not make a copy of the character tables
1074       used by the compiled pattern; the new pattern code points to  the  same
1075       tables  as  the original code.  (See "Locale Support" below for details
1076       of these character tables.) In many applications the  same  tables  are
1077       used  throughout, so this behaviour is appropriate. Nevertheless, there
1078       are occasions when a copy of a compiled pattern and the relevant tables
1079       are  needed.  The pcre2_code_copy_with_tables() provides this facility.
1080       Copies of both the code and the tables are  made,  with  the  new  code
1081       pointing  to the new tables. The memory for the new tables is automati‐
1082       cally freed when pcre2_code_free() is called for the new  copy  of  the
1083       compiled  code.  If pcre2_code_copy_with_tables() is called with a NULL
1084       argument, it returns NULL.
1085
1086       NOTE: When one of the matching functions is  called,  pointers  to  the
1087       compiled pattern and the subject string are set in the match data block
1088       so that they can be referenced by the substring  extraction  functions.
1089       After  running a match, you must not free a compiled pattern (or a sub‐
1090       ject string) until after all operations on the match  data  block  have
1091       taken place.
1092
1093       The  options argument for pcre2_compile() contains various bit settings
1094       that affect the compilation. It  should  be  zero  if  no  options  are
1095       required.  The  available options are described below. Some of them (in
1096       particular, those that are compatible with Perl,  but  some  others  as
1097       well)  can  also  be  set  and  unset  from within the pattern (see the
1098       detailed description in the pcre2pattern documentation).
1099
1100       For those options that can be different in different parts of the  pat‐
1101       tern,  the contents of the options argument specifies their settings at
1102       the start of compilation. The  PCRE2_ANCHORED,  PCRE2_ENDANCHORED,  and
1103       PCRE2_NO_UTF_CHECK  options  can be set at the time of matching as well
1104       as at compile time.
1105
1106       Other, less frequently required compile-time parameters  (for  example,
1107       the newline setting) can be provided in a compile context (as described
1108       above).
1109
1110       If errorcode or erroroffset is NULL, pcre2_compile() returns NULL imme‐
1111       diately.  Otherwise,  the  variables to which these point are set to an
1112       error code and an offset (number of code  units)  within  the  pattern,
1113       respectively,  when  pcre2_compile() returns NULL because a compilation
1114       error has occurred. The values are not defined when compilation is suc‐
1115       cessful and pcre2_compile() returns a non-NULL value.
1116
1117       There  are  nearly  100  positive  error codes that pcre2_compile() may
1118       return if it finds an error in the pattern. There are also  some  nega‐
1119       tive  error  codes that are used for invalid UTF strings. These are the
1120       same as given by pcre2_match() and pcre2_dfa_match(), and are described
1121       in  the  pcre2unicode  page. There is no separate documentation for the
1122       positive error codes, because  the  textual  error  messages  that  are
1123       obtained   by   calling  the  pcre2_get_error_message()  function  (see
1124       "Obtaining a textual error message" below) should be  self-explanatory.
1125       Macro  names  starting  with PCRE2_ERROR_ are defined for both positive
1126       and negative error codes in pcre2.h.
1127
1128       The value returned in erroroffset is an indication of where in the pat‐
1129       tern  the  error  occurred. It is not necessarily the furthest point in
1130       the pattern that was read. For example,  after  the  error  "lookbehind
1131       assertion is not fixed length", the error offset points to the start of
1132       the failing assertion. For an invalid UTF-8 or UTF-16 string, the  off‐
1133       set is that of the first code unit of the failing character.
1134
1135       Some  errors are not detected until the whole pattern has been scanned;
1136       in these cases, the offset passed back is the length  of  the  pattern.
1137       Note  that  the  offset is in code units, not characters, even in a UTF
1138       mode. It may sometimes point into the middle of a UTF-8 or UTF-16 char‐
1139       acter.
1140
1141       This  code  fragment shows a typical straightforward call to pcre2_com‐
1142       pile():
1143
1144         pcre2_code *re;
1145         PCRE2_SIZE erroffset;
1146         int errorcode;
1147         re = pcre2_compile(
1148           "^A.*Z",                /* the pattern */
1149           PCRE2_ZERO_TERMINATED,  /* the pattern is zero-terminated */
1150           0,                      /* default options */
1151           &errorcode,             /* for error code */
1152           &erroffset,             /* for error offset */
1153           NULL);                  /* no compile context */
1154
1155       The following names for option bits are defined in the  pcre2.h  header
1156       file:
1157
1158         PCRE2_ANCHORED
1159
1160       If this bit is set, the pattern is forced to be "anchored", that is, it
1161       is constrained to match only at the first matching point in the  string
1162       that  is being searched (the "subject string"). This effect can also be
1163       achieved by appropriate constructs in the pattern itself, which is  the
1164       only way to do it in Perl.
1165
1166         PCRE2_ALLOW_EMPTY_CLASS
1167
1168       By  default, for compatibility with Perl, a closing square bracket that
1169       immediately follows an opening one is treated as a data  character  for
1170       the  class.  When  PCRE2_ALLOW_EMPTY_CLASS  is  set,  it terminates the
1171       class, which therefore contains no characters and so can never match.
1172
1173         PCRE2_ALT_BSUX
1174
1175       This option requests alternative handling of  three  escape  sequences,
1176       which  makes  PCRE2's  behaviour more like ECMAscript (aka JavaScript).
1177       When it is set:
1178
1179       (1) \U matches an upper case "U" character; by default \U causes a com‐
1180       pile time error (Perl uses \U to upper case subsequent characters).
1181
1182       (2) \u matches a lower case "u" character unless it is followed by four
1183       hexadecimal digits, in which case the hexadecimal  number  defines  the
1184       code  point  to match. By default, \u causes a compile time error (Perl
1185       uses it to upper case the following character).
1186
1187       (3) \x matches a lower case "x" character unless it is followed by  two
1188       hexadecimal  digits,  in  which case the hexadecimal number defines the
1189       code point to match. By default, as in Perl, a  hexadecimal  number  is
1190       always expected after \x, but it may have zero, one, or two digits (so,
1191       for example, \xz matches a binary zero character followed by z).
1192
1193         PCRE2_ALT_CIRCUMFLEX
1194
1195       In  multiline  mode  (when  PCRE2_MULTILINE  is  set),  the  circumflex
1196       metacharacter  matches at the start of the subject (unless PCRE2_NOTBOL
1197       is set), and also after any internal  newline.  However,  it  does  not
1198       match after a newline at the end of the subject, for compatibility with
1199       Perl. If you want a multiline circumflex also to match after  a  termi‐
1200       nating newline, you must set PCRE2_ALT_CIRCUMFLEX.
1201
1202         PCRE2_ALT_VERBNAMES
1203
1204       By  default, for compatibility with Perl, the name in any verb sequence
1205       such as (*MARK:NAME) is  any  sequence  of  characters  that  does  not
1206       include  a  closing  parenthesis. The name is not processed in any way,
1207       and it is not possible to include a closing parenthesis  in  the  name.
1208       However,  if  the  PCRE2_ALT_VERBNAMES  option is set, normal backslash
1209       processing is applied to verb  names  and  only  an  unescaped  closing
1210       parenthesis  terminates the name. A closing parenthesis can be included
1211       in a name either as \) or between \Q and \E. If the  PCRE2_EXTENDED  or
1212       PCRE2_EXTENDED_MORE  option  is set with PCRE2_ALT_VERBNAMES, unescaped
1213       whitespace in verb names is  skipped  and  #-comments  are  recognized,
1214       exactly as in the rest of the pattern.
1215
1216         PCRE2_AUTO_CALLOUT
1217
1218       If  this  bit  is  set,  pcre2_compile()  automatically inserts callout
1219       items, all with number 255, before each pattern  item,  except  immedi‐
1220       ately  before  or after an explicit callout in the pattern. For discus‐
1221       sion of the callout facility, see the pcre2callout documentation.
1222
1223         PCRE2_CASELESS
1224
1225       If this bit is set, letters in the pattern match both upper  and  lower
1226       case  letters in the subject. It is equivalent to Perl's /i option, and
1227       it can be changed within  a  pattern  by  a  (?i)  option  setting.  If
1228       PCRE2_UTF  is  set, Unicode properties are used for all characters with
1229       more than one other case, and for all characters whose code points  are
1230       greater  than  U+007F.  For lower valued characters with only one other
1231       case, a lookup table is used for speed. When PCRE2_UTF is  not  set,  a
1232       lookup table is used for all code points less than 256, and higher code
1233       points (available only in 16-bit or 32-bit mode)  are  treated  as  not
1234       having another case.
1235
1236         PCRE2_DOLLAR_ENDONLY
1237
1238       If  this bit is set, a dollar metacharacter in the pattern matches only
1239       at the end of the subject string. Without this option,  a  dollar  also
1240       matches  immediately before a newline at the end of the string (but not
1241       before any other newlines). The PCRE2_DOLLAR_ENDONLY option is  ignored
1242       if  PCRE2_MULTILINE  is  set.  There is no equivalent to this option in
1243       Perl, and no way to set it within a pattern.
1244
1245         PCRE2_DOTALL
1246
1247       If this bit is set, a dot metacharacter  in  the  pattern  matches  any
1248       character,  including  one  that  indicates a newline. However, it only
1249       ever matches one character, even if newlines are coded as CRLF. Without
1250       this option, a dot does not match when the current position in the sub‐
1251       ject is at a newline. This option is equivalent to  Perl's  /s  option,
1252       and it can be changed within a pattern by a (?s) option setting. A neg‐
1253       ative class such as [^a] always matches newline characters, and the  \N
1254       escape  sequence always matches a non-newline character, independent of
1255       the setting of PCRE2_DOTALL.
1256
1257         PCRE2_DUPNAMES
1258
1259       If this bit is set, names used to identify capturing  subpatterns  need
1260       not be unique. This can be helpful for certain types of pattern when it
1261       is known that only one instance of the named  subpattern  can  ever  be
1262       matched.  There  are  more details of named subpatterns below; see also
1263       the pcre2pattern documentation.
1264
1265         PCRE2_ENDANCHORED
1266
1267       If this bit is set, the end of any pattern match must be right  at  the
1268       end of the string being searched (the "subject string"). If the pattern
1269       match succeeds by reaching (*ACCEPT), but does not reach the end of the
1270       subject,  the match fails at the current starting point. For unanchored
1271       patterns, a new match is then tried at the next  starting  point.  How‐
1272       ever, if the match succeeds by reaching the end of the pattern, but not
1273       the end of the subject, backtracking occurs and  an  alternative  match
1274       may be found. Consider these two patterns:
1275
1276         .(*ACCEPT)|..
1277         .|..
1278
1279       If  matched against "abc" with PCRE2_ENDANCHORED set, the first matches
1280       "c" whereas the second matches "bc". The  effect  of  PCRE2_ENDANCHORED
1281       can  also  be achieved by appropriate constructs in the pattern itself,
1282       which is the only way to do it in Perl.
1283
1284       For DFA matching with pcre2_dfa_match(), PCRE2_ENDANCHORED applies only
1285       to  the  first  (that  is,  the longest) matched string. Other parallel
1286       matches, which are necessarily substrings of the first one, must  obvi‐
1287       ously end before the end of the subject.
1288
1289         PCRE2_EXTENDED
1290
1291       If  this  bit  is  set,  most white space characters in the pattern are
1292       totally ignored except when escaped or inside a character  class.  How‐
1293       ever,  white  space  is  not  allowed within sequences such as (?> that
1294       introduce various parenthesized subpatterns, nor within numerical quan‐
1295       tifiers  such  as {1,3}.  Ignorable white space is permitted between an
1296       item and a following quantifier and between a quantifier and a  follow‐
1297       ing  +  that indicates possessiveness.  PCRE2_EXTENDED is equivalent to
1298       Perl's /x option, and it can be changed within  a  pattern  by  a  (?x)
1299       option setting.
1300
1301       When  PCRE2  is compiled without Unicode support, PCRE2_EXTENDED recog‐
1302       nizes as white space only those characters with code points  less  than
1303       256 that are flagged as white space in its low-character table. The ta‐
1304       ble is normally created by pcre2_maketables(), which uses the isspace()
1305       function  to identify space characters. In most ASCII environments, the
1306       relevant characters are those with code  points  0x0009  (tab),  0x000A
1307       (linefeed),  0x000B (vertical tab), 0x000C (formfeed), 0x000D (carriage
1308       return), and 0x0020 (space).
1309
1310       When PCRE2 is compiled with Unicode support, in addition to these char‐
1311       acters,  five  more Unicode "Pattern White Space" characters are recog‐
1312       nized by PCRE2_EXTENDED. These are U+0085 (next line), U+200E (left-to-
1313       right  mark), U+200F (right-to-left mark), U+2028 (line separator), and
1314       U+2029 (paragraph separator). This set of characters  is  the  same  as
1315       recognized  by  Perl's /x option. Note that the horizontal and vertical
1316       space characters that are matched by the \h and \v escapes in  patterns
1317       are a much bigger set.
1318
1319       As  well as ignoring most white space, PCRE2_EXTENDED also causes char‐
1320       acters between an unescaped # outside a character class  and  the  next
1321       newline,  inclusive,  to be ignored, which makes it possible to include
1322       comments inside complicated patterns. Note that the end of this type of
1323       comment  is a literal newline sequence in the pattern; escape sequences
1324       that happen to represent a newline do not count.
1325
1326       Which characters are interpreted as newlines can be specified by a set‐
1327       ting  in  the compile context that is passed to pcre2_compile() or by a
1328       special sequence at the start of the pattern, as described in the  sec‐
1329       tion  entitled "Newline conventions" in the pcre2pattern documentation.
1330       A default is defined when PCRE2 is built.
1331
1332         PCRE2_EXTENDED_MORE
1333
1334       This option  has  the  effect  of  PCRE2_EXTENDED,  but,  in  addition,
1335       unescaped  space  and  horizontal  tab  characters are ignored inside a
1336       character class. Note: only these two characters are ignored,  not  the
1337       full  set  of pattern white space characters that are ignored outside a
1338       character  class.  PCRE2_EXTENDED_MORE  is  equivalent  to  Perl's  /xx
1339       option,  and  it can be changed within a pattern by a (?xx) option set‐
1340       ting.
1341
1342         PCRE2_FIRSTLINE
1343
1344       If this option is set, the start of an unanchored pattern match must be
1345       before  or  at  the  first  newline in the subject string following the
1346       start of matching, though the matched text may continue over  the  new‐
1347       line. If startoffset is non-zero, the limiting newline is not necessar‐
1348       ily the first newline in the  subject.  For  example,  if  the  subject
1349       string is "abc\nxyz" (where \n represents a single-character newline) a
1350       pattern match for "yz" succeeds with PCRE2_FIRSTLINE if startoffset  is
1351       greater  than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more
1352       general limiting facility. If PCRE2_FIRSTLINE is  set  with  an  offset
1353       limit,  a match must occur in the first line and also within the offset
1354       limit. In other words, whichever limit comes first is used.
1355
1356         PCRE2_LITERAL
1357
1358       If this option is set, all meta-characters in the pattern are disabled,
1359       and  it is treated as a literal string. Matching literal strings with a
1360       regular expression engine is not the most efficient way of doing it. If
1361       you  are  doing  a  lot of literal matching and are worried about effi‐
1362       ciency, you should consider using other approaches. The only other main
1363       options  that  are  allowed  with  PCRE2_LITERAL  are:  PCRE2_ANCHORED,
1364       PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, PCRE2_CASELESS, PCRE2_FIRSTLINE,
1365       PCRE2_NO_START_OPTIMIZE,     PCRE2_NO_UTF_CHECK,     PCRE2_UTF,     and
1366       PCRE2_USE_OFFSET_LIMIT. The extra  options  PCRE2_EXTRA_MATCH_LINE  and
1367       PCRE2_EXTRA_MATCH_WORD  are  also supported. Any other options cause an
1368       error.
1369
1370         PCRE2_MATCH_UNSET_BACKREF
1371
1372       If this option is set, a backreference to  an  unset  subpattern  group
1373       matches  an  empty  string (by default this causes the current matching
1374       alternative to fail).  A pattern such as  (\1)(a)  succeeds  when  this
1375       option  is set (assuming it can find an "a" in the subject), whereas it
1376       fails by default, for Perl compatibility.  Setting  this  option  makes
1377       PCRE2 behave more like ECMAScript (aka JavaScript).
1378
1379         PCRE2_MULTILINE
1380
1381       By  default,  for  the purposes of matching "start of line" and "end of
1382       line", PCRE2 treats the subject string as consisting of a  single  line
1383       of  characters,  even  if  it actually contains newlines. The "start of
1384       line" metacharacter (^) matches only at the start of  the  string,  and
1385       the  "end  of  line"  metacharacter  ($) matches only at the end of the
1386       string,  or  before  a  terminating  newline  (except  when  PCRE2_DOL‐
1387       LAR_ENDONLY  is  set).  Note, however, that unless PCRE2_DOTALL is set,
1388       the "any character" metacharacter (.) does not match at a newline. This
1389       behaviour (for ^, $, and dot) is the same as Perl.
1390
1391       When  PCRE2_MULTILINE  is  set,  the  "start of line" and "end of line"
1392       constructs match immediately following or immediately  before  internal
1393       newlines  in  the  subject string, respectively, as well as at the very
1394       start and end. This is equivalent to Perl's /m option, and  it  can  be
1395       changed within a pattern by a (?m) option setting. Note that the "start
1396       of line" metacharacter does not match after a newline at the end of the
1397       subject,  for compatibility with Perl.  However, you can change this by
1398       setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in  a
1399       subject  string,  or  no  occurrences  of  ^ or $ in a pattern, setting
1400       PCRE2_MULTILINE has no effect.
1401
1402         PCRE2_NEVER_BACKSLASH_C
1403
1404       This option locks out the use of \C in the pattern that is  being  com‐
1405       piled.   This  escape  can  cause  unpredictable  behaviour in UTF-8 or
1406       UTF-16 modes, because it may leave the current matching  point  in  the
1407       middle  of  a  multi-code-unit  character. This option may be useful in
1408       applications that process patterns from  external  sources.  Note  that
1409       there is also a build-time option that permanently locks out the use of
1410       \C.
1411
1412         PCRE2_NEVER_UCP
1413
1414       This option locks out the use of Unicode properties  for  handling  \B,
1415       \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as
1416       described for the PCRE2_UCP option below. In  particular,  it  prevents
1417       the  creator of the pattern from enabling this facility by starting the
1418       pattern with (*UCP). This option may be  useful  in  applications  that
1419       process patterns from external sources. The  combination  of  PCRE2_UCP
1420       and PCRE2_NEVER_UCP causes an error.
1421
1422         PCRE2_NEVER_UTF
1423
1424       This option locks out interpretation of the pattern as  UTF-8,  UTF-16,
1425       or UTF-32, depending on which library is in use. In particular, it pre‐
1426       vents the creator of the pattern from switching to  UTF  interpretation
1427       by  starting  the  pattern  with  (*UTF).  This option may be useful in
1428       applications that process patterns from external sources. The  combina‐
1429       tion of PCRE2_UTF and PCRE2_NEVER_UTF causes an error.
1430
1431         PCRE2_NO_AUTO_CAPTURE
1432
1433       If this option is set, it disables the use of numbered capturing paren‐
1434       theses in the pattern. Any opening parenthesis that is not followed  by
1435       ?  behaves as if it were followed by ?: but named parentheses can still
1436       be used for capturing (and they acquire numbers in the usual way). This
1437       is  the  same as Perl's /n option.  Note that, when this option is set,
1438       references to capturing groups (backreferences or  recursion/subroutine
1439       calls)  may  only refer to named groups, though the reference can be by
1440       name or by number.
1441
1442         PCRE2_NO_AUTO_POSSESS
1443
1444       If this option is set, it disables "auto-possessification", which is an
1445       optimization  that,  for example, turns a+b into a++b in order to avoid
1446       backtracks into a+ that can never be successful. However,  if  callouts
1447       are  in  use,  auto-possessification means that some callouts are never
1448       taken. You can set this option if you want the matching functions to do
1449       a  full  unoptimized  search and run all the callouts, but it is mainly
1450       provided for testing purposes.
1451
1452         PCRE2_NO_DOTSTAR_ANCHOR
1453
1454       If this option is set, it disables an optimization that is applied when
1455       .*  is  the  first significant item in a top-level branch of a pattern,
1456       and all the other branches also start with .* or with \A or  \G  or  ^.
1457       The  optimization  is  automatically disabled for .* if it is inside an
1458       atomic group or a capturing group that is the subject of  a  backrefer‐
1459       ence,  or  if  the pattern contains (*PRUNE) or (*SKIP). When the opti‐
1460       mization is not disabled, such a pattern is automatically  anchored  if
1461       PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set
1462       for any ^ items. Otherwise, the fact that any match must  start  either
1463       at  the start of the subject or following a newline is remembered. Like
1464       other optimizations, this can cause callouts to be skipped.
1465
1466         PCRE2_NO_START_OPTIMIZE
1467
1468       This is an option whose main effect is at matching time.  It  does  not
1469       change what pcre2_compile() generates, but it does affect the output of
1470       the JIT compiler.
1471
1472       There are a number of optimizations that may occur at the  start  of  a
1473       match,  in  order  to speed up the process. For example, if it is known
1474       that an unanchored match must start with a specific  code  unit  value,
1475       the  matching code searches the subject for that value, and fails imme‐
1476       diately if it cannot find it, without actually running the main  match‐
1477       ing  function.  This means that a special item such as (*COMMIT) at the
1478       start of a pattern is not considered until after  a  suitable  starting
1479       point  for  the  match  has  been found. Also, when callouts or (*MARK)
1480       items are in use, these "start-up" optimizations can cause them  to  be
1481       skipped  if  the pattern is never actually used. The start-up optimiza‐
1482       tions are in effect a pre-scan of the subject that takes  place  before
1483       the pattern is run.
1484
1485       The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
1486       possibly causing performance to suffer,  but  ensuring  that  in  cases
1487       where  the  result is "no match", the callouts do occur, and that items
1488       such as (*COMMIT) and (*MARK) are considered at every possible starting
1489       position in the subject string.
1490
1491       Setting  PCRE2_NO_START_OPTIMIZE  may  change the outcome of a matching
1492       operation.  Consider the pattern
1493
1494         (*COMMIT)ABC
1495
1496       When this is compiled, PCRE2 records the fact that a match  must  start
1497       with  the  character  "A".  Suppose the subject string is "DEFABC". The
1498       start-up optimization scans along the subject, finds "A" and  runs  the
1499       first  match attempt from there. The (*COMMIT) item means that the pat‐
1500       tern must match the current starting position, which in this  case,  it
1501       does.  However,  if  the same match is run with PCRE2_NO_START_OPTIMIZE
1502       set, the initial scan along the subject string  does  not  happen.  The
1503       first  match  attempt  is  run  starting  from "D" and when this fails,
1504       (*COMMIT) prevents any further matches  being  tried,  so  the  overall
1505       result is "no match".
1506
1507       There  are  also  other  start-up optimizations. For example, a minimum
1508       length for the subject may be recorded. Consider the pattern
1509
1510         (*MARK:A)(X|Y)
1511
1512       The minimum length for a match is one  character.  If  the  subject  is
1513       "ABC", there will be attempts to match "ABC", "BC", and "C". An attempt
1514       to match an empty string at the end of the subject does not take place,
1515       because  PCRE2  knows  that  the  subject  is now too short, and so the
1516       (*MARK) is never encountered. In this case, the optimization  does  not
1517       affect the overall match result, which is still "no match", but it does
1518       affect the auxiliary information that is returned.
1519
1520         PCRE2_NO_UTF_CHECK
1521
1522       When PCRE2_UTF is set, the validity of the pattern as a UTF  string  is
1523       automatically  checked.  There  are  discussions  about the validity of
1524       UTF-8 strings, UTF-16 strings, and UTF-32 strings in  the  pcre2unicode
1525       document.  If an invalid UTF sequence is found, pcre2_compile() returns
1526       a negative error code.
1527
1528       If you know that your pattern is a valid UTF string, and  you  want  to
1529       skip   this   check   for   performance   reasons,   you  can  set  the
1530       PCRE2_NO_UTF_CHECK option. When it is set, the  effect  of  passing  an
1531       invalid UTF string as a pattern is undefined. It may cause your program
1532       to crash or loop.
1533
1534       Note  that  this  option  can  also  be  passed  to  pcre2_match()  and
1535       pcre2_dfa_match(), to  suppress  UTF  validity  checking of the subject
1536       string.
1537
1538       Note also that setting PCRE2_NO_UTF_CHECK at compile time does not dis‐
1539       able  the error that is given if an escape sequence for an invalid Uni‐
1540       code code point is encountered in the pattern. In particular,  the  so-
1541       called  "surrogate"  code points (0xd800 to 0xdfff) are invalid. If you
1542       want to allow escape  sequences  such  as  \x{d800}  you  can  set  the
1543       PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  extra  option, as described in the
1544       section entitled "Extra compile options" below.  However, this is  pos‐
1545       sible only in UTF-8 and UTF-32 modes, because these values are not rep‐
1546       resentable in UTF-16.
1547
1548         PCRE2_UCP
1549
1550       This option changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W,
1551       \w,  and  some  of  the POSIX character classes. By default, only ASCII
1552       characters are recognized, but if PCRE2_UCP is set, Unicode  properties
1553       are  used instead to classify characters. More details are given in the
1554       section on generic character types in the pcre2pattern page. If you set
1555       PCRE2_UCP,  matching one of the items it affects takes much longer. The
1556       option is available only if PCRE2 has been compiled with  Unicode  sup‐
1557       port (which is the default).
1558
1559         PCRE2_UNGREEDY
1560
1561       This  option  inverts  the "greediness" of the quantifiers so that they
1562       are not greedy by default, but become greedy if followed by "?". It  is
1563       not  compatible  with Perl. It can also be set by a (?U) option setting
1564       within the pattern.
1565
1566         PCRE2_USE_OFFSET_LIMIT
1567
1568       This option must be set for pcre2_compile() if pcre2_set_offset_limit()
1569       is  going  to be used to set a non-default offset limit in a match con‐
1570       text for matches that use this pattern. An error  is  generated  if  an
1571       offset  limit  is  set  without  this option. For more details, see the
1572       description of pcre2_set_offset_limit() in the section  that  describes
1573       match contexts. See also the PCRE2_FIRSTLINE option above.
1574
1575         PCRE2_UTF
1576
1577       This  option  causes  PCRE2  to regard both the pattern and the subject
1578       strings that are subsequently processed as strings  of  UTF  characters
1579       instead  of  single-code-unit  strings.  It  is available when PCRE2 is
1580       built to include Unicode support (which is  the  default).  If  Unicode
1581       support  is  not  available,  the use of this option provokes an error.
1582       Details of how PCRE2_UTF changes the behaviour of PCRE2  are  given  in
1583       the  pcre2unicode  page.  In  particular,  note that it changes the way
1584       PCRE2_CASELESS handles characters with code points greater than 127.
1585
1586   Extra compile options
1587
1588       Unlike the main compile-time options, the extra options are  not  saved
1589       with the compiled pattern. The option bits that can be set in a compile
1590       context by calling the pcre2_set_compile_extra_options()  function  are
1591       as follows:
1592
1593         PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
1594
1595       This  option  applies when compiling a pattern in UTF-8 or UTF-32 mode.
1596       It is forbidden in UTF-16 mode, and ignored in non-UTF  modes.  Unicode
1597       "surrogate" code points in the range 0xd800 to 0xdfff are used in pairs
1598       in UTF-16 to encode code points with values in  the  range  0x10000  to
1599       0x10ffff.  The  surrogates  cannot  therefore be represented in UTF-16.
1600       They can be represented in UTF-8 and UTF-32, but are defined as invalid
1601       code  points,  and  cause  errors  if  encountered in a UTF-8 or UTF-32
1602       string that is being checked for validity by PCRE2.
1603
1604       These values also cause errors if encountered in escape sequences  such
1605       as \x{d912} within a pattern. However, it seems that some applications,
1606       when using PCRE2 to check for unwanted  characters  in  UTF-8  strings,
1607       explicitly   test  for  the  surrogates  using  escape  sequences.  The
1608       PCRE2_NO_UTF_CHECK option does  not  disable  the  error  that  occurs,
1609       because  it applies only to the testing of input strings for UTF valid‐
1610       ity.
1611
1612       If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set,  surro‐
1613       gate  code  point values in UTF-8 and UTF-32 patterns no longer provoke
1614       errors and are incorporated in the compiled pattern. However, they  can
1615       only  match  subject characters if the matching function is called with
1616       PCRE2_NO_UTF_CHECK set.
1617
1618         PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL
1619
1620       This is a dangerous option. Use with care. By default, an  unrecognized
1621       escape  such  as \j or a malformed one such as \x{2z} causes a compile-
1622       time error when detected by pcre2_compile(). Perl is somewhat inconsis‐
1623       tent  in  handling  such items: for example, \j is treated as a literal
1624       "j", and non-hexadecimal digits in \x{} are just ignored, though  warn‐
1625       ings  are given in both cases if Perl's warning switch is enabled. How‐
1626       ever, a malformed octal number after \o{  always  causes  an  error  in
1627       Perl.
1628
1629       If  the  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL  extra  option  is passed to
1630       pcre2_compile(), all unrecognized or  erroneous  escape  sequences  are
1631       treated  as  single-character escapes. For example, \j is a literal "j"
1632       and \x{2z} is treated as  the  literal  string  "x{2z}".  Setting  this
1633       option  means  that  typos in patterns may go undetected and have unex‐
1634       pected results. This is a dangerous option. Use with care.
1635
1636         PCRE2_EXTRA_MATCH_LINE
1637
1638       This option is provided for use by  the  -x  option  of  pcre2grep.  It
1639       causes  the  pattern  only to match complete lines. This is achieved by
1640       automatically inserting the code for "^(?:" at the start  of  the  com‐
1641       piled  pattern  and ")$" at the end. Thus, when PCRE2_MULTILINE is set,
1642       the matched line may be in the  middle  of  the  subject  string.  This
1643       option can be used with PCRE2_LITERAL.
1644
1645         PCRE2_EXTRA_MATCH_WORD
1646
1647       This  option  is  provided  for  use  by the -w option of pcre2grep. It
1648       causes the pattern only to match strings that have a word  boundary  at
1649       the  start and the end. This is achieved by automatically inserting the
1650       code for "\b(?:" at the start of the compiled pattern and ")\b" at  the
1651       end.  The option may be used with PCRE2_LITERAL. However, it is ignored
1652       if PCRE2_EXTRA_MATCH_LINE is also set.
1653

JUST-IN-TIME (JIT) COMPILATION

1655
1656       int pcre2_jit_compile(pcre2_code *code, uint32_t options);
1657
1658       int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
1659         PCRE2_SIZE length, PCRE2_SIZE startoffset,
1660         uint32_t options, pcre2_match_data *match_data,
1661         pcre2_match_context *mcontext);
1662
1663       void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
1664
1665       pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE startsize,
1666         PCRE2_SIZE maxsize, pcre2_general_context *gcontext);
1667
1668       void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
1669         pcre2_jit_callback callback_function, void *callback_data);
1670
1671       void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
1672
1673       These functions provide support for  JIT  compilation,  which,  if  the
1674       just-in-time  compiler  is available, further processes a compiled pat‐
1675       tern into machine code that executes much faster than the pcre2_match()
1676       interpretive  matching function. Full details are given in the pcre2jit
1677       documentation.
1678
1679       JIT compilation is a heavyweight optimization. It can  take  some  time
1680       for  patterns  to  be analyzed, and for one-off matches and simple pat‐
1681       terns the benefit of faster execution might be offset by a much  slower
1682       compilation  time.  Most (but not all) patterns can be optimized by the
1683       JIT compiler.
1684

LOCALE SUPPORT

1686
1687       PCRE2 handles caseless matching, and determines whether characters  are
1688       letters,  digits, or whatever, by reference to a set of tables, indexed
1689       by character code point. This applies only  to  characters  whose  code
1690       points  are  less than 256. By default, higher-valued code points never
1691       match escapes such as \w or \d.  However, if PCRE2 is built  with  Uni‐
1692       code support, all characters can be tested with \p and \P, or, alterna‐
1693       tively, the PCRE2_UCP option can be set when  a  pattern  is  compiled;
1694       this  causes  \w and friends to use Unicode property support instead of
1695       the built-in tables.
1696
1697       The use of locales with Unicode is discouraged.  If  you  are  handling
1698       characters  with  code  points  greater than 127, you should either use
1699       Unicode support, or use locales, but not try to mix the two.
1700
1701       PCRE2 contains an internal set of character tables  that  are  used  by
1702       default.   These  are  sufficient  for many applications. Normally, the
1703       internal tables recognize only ASCII characters. However, when PCRE2 is
1704       built, it is possible to cause the internal tables to be rebuilt in the
1705       default "C" locale of the local system, which may cause them to be dif‐
1706       ferent.
1707
1708       The  internal tables can be overridden by tables supplied by the appli‐
1709       cation that calls PCRE2. These may be created  in  a  different  locale
1710       from  the  default.  As more and more applications change to using Uni‐
1711       code, the need for this locale support is expected to die away.
1712
1713       External tables are built by calling the  pcre2_maketables()  function,
1714       in  the relevant locale. The result can be passed to pcre2_compile() as
1715       often  as  necessary,  by  creating  a  compile  context  and   calling
1716       pcre2_set_character_tables()  to  set  the  tables pointer therein. For
1717       example, to build and use tables that are appropriate  for  the  French
1718       locale  (where  accented  characters  with  values greater than 128 are
1719       treated as letters), the following code could be used:
1720
1721         setlocale(LC_CTYPE, "fr_FR");
1722         tables = pcre2_maketables(NULL);
1723         ccontext = pcre2_compile_context_create(NULL);
1724         pcre2_set_character_tables(ccontext, tables);
1725         re = pcre2_compile(..., ccontext);
1726
1727       The locale name "fr_FR" is used on Linux and other  Unix-like  systems;
1728       if  you  are using Windows, the name for the French locale is "french".
1729       It is the caller's responsibility to ensure that the memory  containing
1730       the tables remains available for as long as it is needed.
1731
1732       The pointer that is passed (via the compile context) to pcre2_compile()
1733       is saved with the compiled pattern, and the same  tables  are  used  by
1734       pcre2_match() and pcre2_dfa_match(). Thus, for any single pattern, com‐
1735       pilation and matching both happen in the  same  locale,  but  different
1736       patterns can be processed in different locales.
1737

INFORMATION ABOUT A COMPILED PATTERN

1739
1740       int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);
1741
1742       The  pcre2_pattern_info()  function returns general information about a
1743       compiled pattern. For information about callouts, see the next section.
1744       The  first  argument  for pcre2_pattern_info() is a pointer to the com‐
1745       piled pattern. The second argument specifies which piece of information
1746       is  required,  and  the  third  argument  is a pointer to a variable to
1747       receive the data. If the third argument is NULL, the first argument  is
1748       ignored,  and  the  function  returns the size in bytes of the variable
1749       that is required for the information requested. Otherwise, the yield of
1750       the function is zero for success, or one of the following negative num‐
1751       bers:
1752
1753         PCRE2_ERROR_NULL           the argument code was NULL
1754         PCRE2_ERROR_BADMAGIC       the "magic number" was not found
1755         PCRE2_ERROR_BADOPTION      the value of what was invalid
1756         PCRE2_ERROR_UNSET          the requested field is not set
1757
1758       The "magic number" is placed at the start of each compiled  pattern  as
1759       a  simple  check against passing an arbitrary memory pointer. Here is a
1760       typical call of pcre2_pattern_info(), to obtain the length of the  com‐
1761       piled pattern:
1762
1763         int rc;
1764         size_t length;
1765         rc = pcre2_pattern_info(
1766           re,               /* result of pcre2_compile() */
1767           PCRE2_INFO_SIZE,  /* what is required */
1768           &length);         /* where to put the data */
1769
1770       The possible values for the second argument are defined in pcre2.h, and
1771       are as follows:
1772
1773         PCRE2_INFO_ALLOPTIONS
1774         PCRE2_INFO_ARGOPTIONS
1775         PCRE2_INFO_EXTRAOPTIONS
1776
1777       Return copies of the pattern's options. The third argument should point
1778       to  a  uint32_t  variable.  PCRE2_INFO_ARGOPTIONS  returns  exactly the
1779       options that were passed to pcre2_compile(), whereas  PCRE2_INFO_ALLOP‐
1780       TIONS  returns  the compile options as modified by any top-level (*XXX)
1781       option settings such as (*UTF) at the  start  of  the  pattern  itself.
1782       PCRE2_INFO_EXTRAOPTIONS  returns the extra options that were set in the
1783       compile context by calling the pcre2_set_compile_extra_options()  func‐
1784       tion.
1785
1786       For   example,   if  the  pattern  /(*UTF)abc/  is  compiled  with  the
1787       PCRE2_EXTENDED  option,  the  result   for   PCRE2_INFO_ALLOPTIONS   is
1788       PCRE2_EXTENDED  and  PCRE2_UTF.   Option settings such as (?i) that can
1789       change within a pattern do not affect the result  of  PCRE2_INFO_ALLOP‐
1790       TIONS, even if they appear right at the start of the pattern. (This was
1791       different in some earlier releases.)
1792
1793       A pattern compiled without PCRE2_ANCHORED is automatically anchored  by
1794       PCRE2 if the first significant item in every top-level branch is one of
1795       the following:
1796
1797         ^     unless PCRE2_MULTILINE is set
1798         \A    always
1799         \G    always
1800         .*    sometimes - see below
1801
1802       When .* is the first significant item, anchoring is possible only  when
1803       all the following are true:
1804
1805         .* is not in an atomic group
1806         .* is not in a capturing group that is the subject
1807              of a backreference
1808         PCRE2_DOTALL is in force for .*
1809         Neither (*PRUNE) nor (*SKIP) appears in the pattern
1810         PCRE2_NO_DOTSTAR_ANCHOR is not set
1811
1812       For  patterns  that are auto-anchored, the PCRE2_ANCHORED bit is set in
1813       the options returned for PCRE2_INFO_ALLOPTIONS.
1814
1815         PCRE2_INFO_BACKREFMAX
1816
1817       Return the number of the highest  backreference  in  the  pattern.  The
1818       third  argument should point to an uint32_t variable. Named subpatterns
1819       acquire numbers as well as names, and these count towards  the  highest
1820       backreference.   Backreferences such as \4 or \g{12} match the captured
1821       characters of the given group, but in addition, the check that  a  cap‐
1822       turing  group  is  set in a conditional subpattern such as (?(3)a|b) is
1823       also a backreference. Zero is returned if there are no backreferences.
1824
1825         PCRE2_INFO_BSR
1826
1827       The output is a uint32_t integer whose value indicates  what  character
1828       sequences  the \R escape sequence matches. A value of PCRE2_BSR_UNICODE
1829       means that \R matches any Unicode line  ending  sequence;  a  value  of
1830       PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF.
1831
1832         PCRE2_INFO_CAPTURECOUNT
1833
1834       Return  the highest capturing subpattern number in the pattern. In pat‐
1835       terns where (?| is not used, this is also the total number of capturing
1836       subpatterns.  The third argument should point to an uint32_t variable.
1837
1838         PCRE2_INFO_DEPTHLIMIT
1839
1840       If  the  pattern set a backtracking depth limit by including an item of
1841       the form (*LIMIT_DEPTH=nnnn) at the start, the value is  returned.  The
1842       third argument should point to a uint32_t integer. If no such value has
1843       been  set,  the  call  to  pcre2_pattern_info()   returns   the   error
1844       PCRE2_ERROR_UNSET. Note that this limit will only be used during match‐
1845       ing if it is less than the limit set or defaulted by the caller of  the
1846       match function.
1847
1848         PCRE2_INFO_FIRSTBITMAP
1849
1850       In  the absence of a single first code unit for a non-anchored pattern,
1851       pcre2_compile() may construct a 256-bit table that defines a fixed  set
1852       of  values for the first code unit in any match. For example, a pattern
1853       that starts with [abc] results in a table with  three  bits  set.  When
1854       code  unit  values greater than 255 are supported, the flag bit for 255
1855       means "any code unit of value 255 or above". If such a table  was  con‐
1856       structed,  a pointer to it is returned. Otherwise NULL is returned. The
1857       third argument should point to a const uint8_t * variable.
1858
1859         PCRE2_INFO_FIRSTCODETYPE
1860
1861       Return information about the first code unit of any matched string, for
1862       a  non-anchored pattern. The third argument should point to an uint32_t
1863       variable. If there is a fixed first value, for example, the letter  "c"
1864       from  a  pattern such as (cat|cow|coyote), 1 is returned, and the value
1865       can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is  no  fixed
1866       first  value,  but it is known that a match can occur only at the start
1867       of the subject or following a newline in the subject,  2  is  returned.
1868       Otherwise, and for anchored patterns, 0 is returned.
1869
1870         PCRE2_INFO_FIRSTCODEUNIT
1871
1872       Return  the  value  of  the first code unit of any matched string for a
1873       pattern where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise  return  0.
1874       The  third  argument should point to an uint32_t variable. In the 8-bit
1875       library, the value is always less than 256. In the 16-bit  library  the
1876       value  can  be  up  to 0xffff. In the 32-bit library in UTF-32 mode the
1877       value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32
1878       mode.
1879
1880         PCRE2_INFO_FRAMESIZE
1881
1882       Return the size (in bytes) of the data frames that are used to remember
1883       backtracking positions when the pattern is processed  by  pcre2_match()
1884       without  the  use  of  JIT. The third argument should point to a size_t
1885       variable. The frame size depends on the number of capturing parentheses
1886       in  the  pattern.  Each  additional capturing group adds two PCRE2_SIZE
1887       variables.
1888
1889         PCRE2_INFO_HASBACKSLASHC
1890
1891       Return 1 if the pattern contains any instances of \C, otherwise 0.  The
1892       third argument should point to an uint32_t variable.
1893
1894         PCRE2_INFO_HASCRORLF
1895
1896       Return  1  if  the  pattern  contains any explicit matches for CR or LF
1897       characters, otherwise 0. The third argument should point to an uint32_t
1898       variable.  An explicit match is either a literal CR or LF character, or
1899       \r or  \n  or  one  of  the  equivalent  hexadecimal  or  octal  escape
1900       sequences.
1901
1902         PCRE2_INFO_HEAPLIMIT
1903
1904       If the pattern set a heap memory limit by including an item of the form
1905       (*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argu‐
1906       ment should point to a uint32_t integer. If no such value has been set,
1907       the call to pcre2_pattern_info() returns the  error  PCRE2_ERROR_UNSET.
1908       Note  that  this  limit will only be used during matching if it is less
1909       than the limit set or defaulted by the caller of the match function.
1910
1911         PCRE2_INFO_JCHANGED
1912
1913       Return 1 if the (?J) or (?-J) option setting is used  in  the  pattern,
1914       otherwise  0.  The third argument should point to an uint32_t variable.
1915       (?J) and (?-J) set and unset the local PCRE2_DUPNAMES  option,  respec‐
1916       tively.
1917
1918         PCRE2_INFO_JITSIZE
1919
1920       If  the  compiled  pattern was successfully processed by pcre2_jit_com‐
1921       pile(), return the size of the  JIT  compiled  code,  otherwise  return
1922       zero. The third argument should point to a size_t variable.
1923
1924         PCRE2_INFO_LASTCODETYPE
1925
1926       Returns  1 if there is a rightmost literal code unit that must exist in
1927       any matched string, other than at its start. The third argument  should
1928       point  to  an  uint32_t  variable.  If  there  is  no  such value, 0 is
1929       returned. When 1 is  returned,  the  code  unit  value  itself  can  be
1930       retrieved  using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last
1931       literal value is recorded only if  it  follows  something  of  variable
1932       length.  For example, for the pattern /^a\d+z\d+/ the returned value is
1933       1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but  for  /^a\dz\d/
1934       the returned value is 0.
1935
1936         PCRE2_INFO_LASTCODEUNIT
1937
1938       Return  the value of the rightmost literal code unit that must exist in
1939       any matched string, other than  at  its  start,  for  a  pattern  where
1940       PCRE2_INFO_LASTCODETYPE returns 1. Otherwise, return 0. The third argu‐
1941       ment should point to an uint32_t variable.
1942
1943         PCRE2_INFO_MATCHEMPTY
1944
1945       Return 1 if the pattern might match an empty string, otherwise  0.  The
1946       third  argument  should  point  to an uint32_t variable. When a pattern
1947       contains recursive subroutine calls it is not always possible to deter‐
1948       mine  whether  or  not it can match an empty string. PCRE2 takes a cau‐
1949       tious approach and returns 1 in such cases.
1950
1951         PCRE2_INFO_MATCHLIMIT
1952
1953       If the pattern set a match limit by  including  an  item  of  the  form
1954       (*LIMIT_MATCH=nnnn)  at  the  start,  the  value is returned. The third
1955       argument should point to a uint32_t integer. If no such value has  been
1956       set,    the    call   to   pcre2_pattern_info()   returns   the   error
1957       PCRE2_ERROR_UNSET. Note that this limit will only be used during match‐
1958       ing  if it is less than the limit set or defaulted by the caller of the
1959       match function.
1960
1961         PCRE2_INFO_MAXLOOKBEHIND
1962
1963       Return the number of characters (not code units) in the longest lookbe‐
1964       hind  assertion  in  the  pattern. The third argument should point to a
1965       uint32_t integer. This information is useful when  doing  multi-segment
1966       matching  using  the  partial matching facilities. Note that the simple
1967       assertions \b and \B require a one-character lookbehind. \A also regis‐
1968       ters  a  one-character  lookbehind, though it does not actually inspect
1969       the previous character. This is to ensure that at least  one  character
1970       from  the old segment is retained when a new segment is processed. Oth‐
1971       erwise, if there are no lookbehinds in  the  pattern,  \A  might  match
1972       incorrectly at the start of a second or subsequent segment.
1973
1974         PCRE2_INFO_MINLENGTH
1975
1976       If  a  minimum  length  for  matching subject strings was computed, its
1977       value is returned. Otherwise the returned value is 0. The  value  is  a
1978       number  of characters, which in UTF mode may be different from the num‐
1979       ber of code units.  The third argument  should  point  to  an  uint32_t
1980       variable.  The  value  is  a  lower bound to the length of any matching
1981       string. There may not be any strings of that length  that  do  actually
1982       match, but every string that does match is at least that long.
1983
1984         PCRE2_INFO_NAMECOUNT
1985         PCRE2_INFO_NAMEENTRYSIZE
1986         PCRE2_INFO_NAMETABLE
1987
1988       PCRE2 supports the use of named as well as numbered capturing parenthe‐
1989       ses. The names are just an additional way of identifying the  parenthe‐
1990       ses, which still acquire numbers. Several convenience functions such as
1991       pcre2_substring_get_byname() are provided for extracting captured  sub‐
1992       strings  by  name. It is also possible to extract the data directly, by
1993       first converting the name to a number in order to  access  the  correct
1994       pointers  in the output vector (described with pcre2_match() below). To
1995       do the conversion, you need to use the  name-to-number  map,  which  is
1996       described by these three values.
1997
1998       The  map  consists  of a number of fixed-size entries. PCRE2_INFO_NAME‐
1999       COUNT gives the number of entries, and  PCRE2_INFO_NAMEENTRYSIZE  gives
2000       the  size  of each entry in code units; both of these return a uint32_t
2001       value. The entry size depends on the length of the longest name.
2002
2003       PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table.
2004       This  is  a  PCRE2_SPTR  pointer to a block of code units. In the 8-bit
2005       library, the first two bytes of each entry are the number of  the  cap‐
2006       turing parenthesis, most significant byte first. In the 16-bit library,
2007       the pointer points to 16-bit code units, the first  of  which  contains
2008       the  parenthesis  number.  In the 32-bit library, the pointer points to
2009       32-bit code units, the first of which contains the parenthesis  number.
2010       The rest of the entry is the corresponding name, zero terminated.
2011
2012       The  names are in alphabetical order. If (?| is used to create multiple
2013       groups with the same number, as described in the section  on  duplicate
2014       subpattern  numbers  in  the pcre2pattern page, the groups may be given
2015       the same name, but there is only one  entry  in  the  table.  Different
2016       names for groups of the same number are not permitted.
2017
2018       Duplicate  names  for subpatterns with different numbers are permitted,
2019       but only if PCRE2_DUPNAMES is set. They appear  in  the  table  in  the
2020       order  in  which  they were found in the pattern. In the absence of (?|
2021       this is the order of increasing number; when (?| is used  this  is  not
2022       necessarily the case because later subpatterns may have lower numbers.
2023
2024       As  a  simple  example of the name/number table, consider the following
2025       pattern after compilation by the 8-bit library  (assume  PCRE2_EXTENDED
2026       is set, so white space - including newlines - is ignored):
2027
2028         (?<date> (?<year>(\d\d)?\d\d) -
2029         (?<month>\d\d) - (?<day>\d\d) )
2030
2031       There  are  four  named subpatterns, so the table has four entries, and
2032       each entry in the table is eight bytes long. The table is  as  follows,
2033       with non-printing bytes shown in hexadecimal, and undefined bytes shown
2034       as ??:
2035
2036         00 01 d  a  t  e  00 ??
2037         00 05 d  a  y  00 ?? ??
2038         00 04 m  o  n  t  h  00
2039         00 02 y  e  a  r  00 ??
2040
2041       When writing code to extract data  from  named  subpatterns  using  the
2042       name-to-number  map,  remember that the length of the entries is likely
2043       to be different for each compiled pattern.
2044
2045         PCRE2_INFO_NEWLINE
2046
2047       The output is one of the following uint32_t values:
2048
2049         PCRE2_NEWLINE_CR       Carriage return (CR)
2050         PCRE2_NEWLINE_LF       Linefeed (LF)
2051         PCRE2_NEWLINE_CRLF     Carriage return, linefeed (CRLF)
2052         PCRE2_NEWLINE_ANY      Any Unicode line ending
2053         PCRE2_NEWLINE_ANYCRLF  Any of CR, LF, or CRLF
2054         PCRE2_NEWLINE_NUL      The NUL character (binary zero)
2055
2056       This identifies the character sequence that will be recognized as mean‐
2057       ing "newline" while matching.
2058
2059         PCRE2_INFO_SIZE
2060
2061       Return  the  size  of  the  compiled  pattern  in  bytes (for all three
2062       libraries). The third argument should point to a size_t variable.  This
2063       value  includes  the  size  of the general data block that precedes the
2064       code units of the compiled pattern itself. The value that is used  when
2065       pcre2_compile()  is  getting memory in which to place the compiled pat‐
2066       tern may be slightly larger than the value  returned  by  this  option,
2067       because  there are cases where the code that calculates the size has to
2068       over-estimate. Processing a pattern with  the  JIT  compiler  does  not
2069       alter the value returned by this option.
2070

INFORMATION ABOUT A PATTERN'S CALLOUTS

2072
2073       int pcre2_callout_enumerate(const pcre2_code *code,
2074         int (*callback)(pcre2_callout_enumerate_block *, void *),
2075         void *user_data);
2076
2077       A script language that supports the use of string arguments in callouts
2078       might like to scan all the callouts in a  pattern  before  running  the
2079       match. This can be done by calling pcre2_callout_enumerate(). The first
2080       argument is a pointer to a compiled pattern, the  second  points  to  a
2081       callback  function,  and the third is arbitrary user data. The callback
2082       function is called for every callout in the pattern  in  the  order  in
2083       which they appear. Its first argument is a pointer to a callout enumer‐
2084       ation block, and its second argument is the user_data  value  that  was
2085       passed  to  pcre2_callout_enumerate(). The contents of the callout enu‐
2086       meration block are described in the pcre2callout  documentation,  which
2087       also gives further details about callouts.
2088

SERIALIZATION AND PRECOMPILING

2090
2091       It  is  possible  to  save  compiled patterns on disc or elsewhere, and
2092       reload them later, subject to a number of  restrictions.  The  host  on
2093       which  the  patterns  are  reloaded must be running the same version of
2094       PCRE2, with the same code unit width, and must also have the same endi‐
2095       anness,  pointer  width,  and PCRE2_SIZE type. Before compiled patterns
2096       can be saved, they must be converted to a "serialized" form,  which  in
2097       the  case of PCRE2 is really just a bytecode dump.  The functions whose
2098       names begin with pcre2_serialize_ are used for converting to  and  from
2099       the  serialized form. They are described in the pcre2serialize documen‐
2100       tation. Note that PCRE2 serialization does not  convert  compiled  pat‐
2101       terns to an abstract format like Java or .NET serialization.
2102

THE MATCH DATA BLOCK

2104
2105       pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
2106         pcre2_general_context *gcontext);
2107
2108       pcre2_match_data *pcre2_match_data_create_from_pattern(
2109         const pcre2_code *code, pcre2_general_context *gcontext);
2110
2111       void pcre2_match_data_free(pcre2_match_data *match_data);
2112
2113       Information  about  a  successful  or unsuccessful match is placed in a
2114       match data block, which is an opaque  structure  that  is  accessed  by
2115       function  calls.  In particular, the match data block contains a vector
2116       of offsets into the subject string that define the matched part of  the
2117       subject  and  any  substrings  that were captured. This is known as the
2118       ovector.
2119
2120       Before calling pcre2_match(), pcre2_dfa_match(),  or  pcre2_jit_match()
2121       you must create a match data block by calling one of the creation func‐
2122       tions above. For pcre2_match_data_create(), the first argument  is  the
2123       number  of  pairs  of  offsets  in  the ovector. One pair of offsets is
2124       required to identify the string that matched the whole pattern, with an
2125       additional  pair for each captured substring. For example, a value of 4
2126       creates enough space to record the matched portion of the subject  plus
2127       three  captured  substrings. A minimum of at least 1 pair is imposed by
2128       pcre2_match_data_create(), so it is always possible to return the over‐
2129       all matched string.
2130
2131       The second argument of pcre2_match_data_create() is a pointer to a gen‐
2132       eral context, which can specify custom memory management for  obtaining
2133       the memory for the match data block. If you are not using custom memory
2134       management, pass NULL, which causes malloc() to be used.
2135
2136       For pcre2_match_data_create_from_pattern(), the  first  argument  is  a
2137       pointer to a compiled pattern. The ovector is created to be exactly the
2138       right size to hold all the substrings a pattern might capture. The sec‐
2139       ond  argument is again a pointer to a general context, but in this case
2140       if NULL is passed, the memory is obtained using the same allocator that
2141       was used for the compiled pattern (custom or default).
2142
2143       A  match  data block can be used many times, with the same or different
2144       compiled patterns. You can extract information from a match data  block
2145       after  a  match  operation  has  finished,  using  functions  that  are
2146       described in the sections on  matched  strings  and  other  match  data
2147       below.
2148
2149       When  a  call  of  pcre2_match()  fails, valid data is available in the
2150       match   block   only   when   the   error    is    PCRE2_ERROR_NOMATCH,
2151       PCRE2_ERROR_PARTIAL,  or  one  of  the  error  codes for an invalid UTF
2152       string. Exactly what is available depends on the error, and is detailed
2153       below.
2154
2155       When  one of the matching functions is called, pointers to the compiled
2156       pattern and the subject string are set in the match data block so  that
2157       they  can  be  referenced  by the extraction functions. After running a
2158       match, you must not free a compiled pattern or a subject  string  until
2159       after  all  operations  on  the  match data block (for that match) have
2160       taken place.
2161
2162       When a match data block itself is no longer needed, it should be  freed
2163       by  calling  pcre2_match_data_free(). If this function is called with a
2164       NULL argument, it returns immediately, without doing anything.
2165

MATCHING A PATTERN: THE TRADITIONAL FUNCTION

2167
2168       int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
2169         PCRE2_SIZE length, PCRE2_SIZE startoffset,
2170         uint32_t options, pcre2_match_data *match_data,
2171         pcre2_match_context *mcontext);
2172
2173       The function pcre2_match() is called to match a subject string  against
2174       a  compiled pattern, which is passed in the code argument. You can call
2175       pcre2_match() with the same code argument as many times as you like, in
2176       order  to  find multiple matches in the subject string or to match dif‐
2177       ferent subject strings with the same pattern.
2178
2179       This function is the main matching facility  of  the  library,  and  it
2180       operates  in  a  Perl-like  manner. For specialist use there is also an
2181       alternative matching function, which is described below in the  section
2182       about the pcre2_dfa_match() function.
2183
2184       Here is an example of a simple call to pcre2_match():
2185
2186         pcre2_match_data *md = pcre2_match_data_create(4, NULL);
2187         int rc = pcre2_match(
2188           re,             /* result of pcre2_compile() */
2189           "some string",  /* the subject string */
2190           11,             /* the length of the subject string */
2191           0,              /* start at offset 0 in the subject */
2192           0,              /* default options */
2193           md,             /* the match data block */
2194           NULL);          /* a match context; NULL means use defaults */
2195
2196       If  the  subject  string is zero-terminated, the length can be given as
2197       PCRE2_ZERO_TERMINATED. A match context must be provided if certain less
2198       common matching parameters are to be changed. For details, see the sec‐
2199       tion on the match context above.
2200
2201   The string to be matched by pcre2_match()
2202
2203       The subject string is passed to pcre2_match() as a pointer in  subject,
2204       a  length  in  length, and a starting offset in startoffset. The length
2205       and offset are in code units, not characters.  That  is,  they  are  in
2206       bytes  for the 8-bit library, 16-bit code units for the 16-bit library,
2207       and 32-bit code units for the 32-bit library, whether or not  UTF  pro‐
2208       cessing is enabled.
2209
2210       If startoffset is greater than the length of the subject, pcre2_match()
2211       returns PCRE2_ERROR_BADOFFSET. When the starting offset  is  zero,  the
2212       search  for a match starts at the beginning of the subject, and this is
2213       by far the most common case. In UTF-8 or UTF-16 mode, the starting off‐
2214       set  must  point to the start of a character, or to the end of the sub‐
2215       ject (in UTF-32 mode, one code unit equals one character, so  all  off‐
2216       sets  are  valid).  Like  the  pattern  string, the subject may contain
2217       binary zeros.
2218
2219       A non-zero starting offset is useful when searching for  another  match
2220       in  the  same  subject  by calling pcre2_match() again after a previous
2221       success.  Setting startoffset differs from  passing  over  a  shortened
2222       string  and  setting  PCRE2_NOTBOL in the case of a pattern that begins
2223       with any kind of lookbehind. For example, consider the pattern
2224
2225         \Biss\B
2226
2227       which finds occurrences of "iss" in the middle of  words.  (\B  matches
2228       only  if  the  current position in the subject is not a word boundary.)
2229       When applied to the string "Mississippi" the first call to pcre2_match()
2230       finds  the first occurrence. If pcre2_match() is called again with just
2231       the remainder of the subject, namely  "issippi",  it  does  not  match,
2232       because \B is always false at the start of the subject, which is deemed
2233       to be a word boundary. However, if pcre2_match() is passed  the  entire
2234       string again, but with startoffset set to 4, it finds the second occur‐
2235       rence of "iss" because it is able to look behind the starting point  to
2236       discover that it is preceded by a letter.
2237
2238       Finding  all  the  matches  in a subject is tricky when the pattern can
2239       match an empty string. It is possible to emulate Perl's /g behaviour by
2240       first   trying   the   match   again  at  the  same  offset,  with  the
2241       PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED options,  and  then  if  that
2242       fails,  advancing  the  starting  offset  and  trying an ordinary match
2243       again. There is some code that demonstrates  how  to  do  this  in  the
2244       pcre2demo  sample  program. In the most general case, you have to check
2245       to see if the newline convention recognizes CRLF as a newline,  and  if
2246       so,  and the current character is CR followed by LF, advance the start‐
2247       ing offset by two characters instead of one.
2248
2249       If a non-zero starting offset is passed when the pattern is anchored, a
2250       single attempt to match at the given offset is made. This can only suc‐
2251       ceed if the pattern does not require the match to be at  the  start  of
2252       the  subject.  In other words, the anchoring must be the result of set‐
2253       ting the PCRE2_ANCHORED option or the use of .* with PCRE2_DOTALL,  not
2254       by starting the pattern with ^ or \A.
2255
2256   Option bits for pcre2_match()
2257
2258       The unused bits of the options argument for pcre2_match() must be zero.
2259       The only bits that may be set  are  PCRE2_ANCHORED,  PCRE2_ENDANCHORED,
2260       PCRE2_NOTBOL,   PCRE2_NOTEOL,  PCRE2_NOTEMPTY,  PCRE2_NOTEMPTY_ATSTART,
2261       PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK,  PCRE2_PARTIAL_HARD,  and  PCRE2_PAR‐
2262       TIAL_SOFT.  Their action is described below.
2263
2264       Setting  PCRE2_ANCHORED  or PCRE2_ENDANCHORED at match time is not sup‐
2265       ported by the just-in-time (JIT) compiler. If either is set, JIT matching
2266       is  disabled  and  the interpretive code in pcre2_match() is run. Apart
2267       from PCRE2_NO_JIT (obviously), the remaining options are supported  for
2268       JIT matching.
2269
2270         PCRE2_ANCHORED
2271
2272       The PCRE2_ANCHORED option limits pcre2_match() to matching at the first
2273       matching position. If a pattern was compiled  with  PCRE2_ANCHORED,  or
2274       turned  out to be anchored by virtue of its contents, it cannot be made
2275       unanchored at matching time. Note that setting the option at match time
2276       disables JIT matching.
2277
2278         PCRE2_ENDANCHORED
2279
2280       If  the  PCRE2_ENDANCHORED option is set, any string that pcre2_match()
2281       matches must be right at the end of the subject string. Note that  set‐
2282       ting the option at match time disables JIT matching.
2283
2284         PCRE2_NOTBOL
2285
2286       This option specifies that the first character of the subject string is not
2287       the beginning of a line, so the  circumflex  metacharacter  should  not
2288       match  before  it.  Setting  this without having set PCRE2_MULTILINE at
2289       compile time causes circumflex never to match. This option affects only
2290       the behaviour of the circumflex metacharacter. It does not affect \A.
2291
2292         PCRE2_NOTEOL
2293
2294       This option specifies that the end of the subject string is not the end
2295       of a line, so the dollar metacharacter should not match it nor  (except
2296       in  multiline mode) a newline immediately before it. Setting this with‐
2297       out having set PCRE2_MULTILINE at compile time causes dollar  never  to
2298       match. This option affects only the behaviour of the dollar metacharac‐
2299       ter. It does not affect \Z or \z.
2300
2301         PCRE2_NOTEMPTY
2302
2303       An empty string is not considered to be a valid match if this option is
2304       set.  If  there are alternatives in the pattern, they are tried. If all
2305       the alternatives match the empty string, the entire  match  fails.  For
2306       example, if the pattern
2307
2308         a?b?
2309
2310       is  applied  to  a  string not beginning with "a" or "b", it matches an
2311       empty string at the start of the subject. With PCRE2_NOTEMPTY set, this
2312       match  is  not valid, so pcre2_match() searches further into the string
2313       for occurrences of "a" or "b".
2314
2315         PCRE2_NOTEMPTY_ATSTART
2316
2317       This is like PCRE2_NOTEMPTY, except that it locks out an  empty  string
2318       match only at the first matching position, that is, at the start of the
2319       subject plus the starting offset. An empty string match  later  in  the
2320       subject  is  permitted.   If  the pattern is anchored, such a match can
2321       occur only if the pattern contains \K.
2322
2323         PCRE2_NO_JIT
2324
2325       By  default,  if  a  pattern  has  been   successfully   processed   by
2326       pcre2_jit_compile(),  JIT  is  automatically used when pcre2_match() is
2327       called with options that JIT supports.  Setting  PCRE2_NO_JIT  disables
2328       the use of JIT; it forces matching to be done by the interpreter.
2329
2330         PCRE2_NO_UTF_CHECK
2331
2332       When PCRE2_UTF is set at compile time, the validity of the subject as a
2333       UTF string is checked by default  when  pcre2_match()  is  subsequently
2334       called.   If  a non-zero starting offset is given, the check is applied
2335       only to that part of the subject that could be inspected during  match‐
2336       ing,  and there is a check that the starting offset points to the first
2337       code unit of a character or to the end of the subject. If there are  no
2338       lookbehind  assertions in the pattern, the check starts at the starting
2339       offset. Otherwise, it starts at the length of  the  longest  lookbehind
2340       before the starting offset, or at the start of the subject if there are
2341       not that many characters before the  starting  offset.  Note  that  the
2342       sequences \b and \B are one-character lookbehinds.
2343
2344       The check is carried out before any other processing takes place, and a
2345       negative error code is returned if the check fails. There  are  several
2346       UTF  error  codes  for each code unit width, corresponding to different
2347       problems with the code unit sequence. There are discussions  about  the
2348       validity  of  UTF-8  strings, UTF-16 strings, and UTF-32 strings in the
2349       pcre2unicode page.
2350
2351       If you know that your subject is valid, and  you  want  to  skip  these
2352       checks  for  performance  reasons,  you  can set the PCRE2_NO_UTF_CHECK
2353       option when calling pcre2_match(). You might want to do  this  for  the
2354       second and subsequent calls to pcre2_match() if you are making repeated
2355       calls to find other matches in the same subject string.
2356
2357       Warning: When PCRE2_NO_UTF_CHECK is  set,  the  effect  of  passing  an
2358       invalid  string  as  a  subject, or an invalid value of startoffset, is
2359       undefined.  Your program may crash or loop indefinitely.
2360
2361         PCRE2_PARTIAL_HARD
2362         PCRE2_PARTIAL_SOFT
2363
2364       These options turn on the partial matching  feature.  A  partial  match
2365       occurs  if  the  end of the subject string is reached successfully, but
2366       there are not enough subject characters to complete the match. If  this
2367       happens  when  PCRE2_PARTIAL_SOFT  (but not PCRE2_PARTIAL_HARD) is set,
2368       matching continues by testing any remaining alternatives.  Only  if  no
2369       complete  match can be found is PCRE2_ERROR_PARTIAL returned instead of
2370       PCRE2_ERROR_NOMATCH. In other words, PCRE2_PARTIAL_SOFT specifies  that
2371       the  caller  is prepared to handle a partial match, but only if no com‐
2372       plete match can be found.
2373
2374       If PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In  this
2375       case,  if  a  partial match is found, pcre2_match() immediately returns
2376       PCRE2_ERROR_PARTIAL, without considering  any  other  alternatives.  In
2377       other words, when PCRE2_PARTIAL_HARD is set, a partial match is consid‐
2378       ered to be more important than an alternative complete match.
2379
2380       There is a more detailed discussion of partial and multi-segment match‐
2381       ing, with examples, in the pcre2partial documentation.
2382

NEWLINE HANDLING WHEN MATCHING

2384
2385       When  PCRE2 is built, a default newline convention is set; this is usu‐
2386       ally the standard convention for the operating system. The default  can
2387       be  overridden  in a compile context by calling pcre2_set_newline(). It
2388       can also be overridden by starting a pattern string with, for  example,
2389       (*CRLF),  as  described  in  the  section on newline conventions in the
2390       pcre2pattern page. During matching, the newline choice affects the  be‐
2391       haviour  of the dot, circumflex, and dollar metacharacters. It may also
2392       alter the way the match starting position is  advanced  after  a  match
2393       failure for an unanchored pattern.
2394
2395       When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is
2396       set as the newline convention, and a match attempt  for  an  unanchored
2397       pattern fails when the current starting position is at a CRLF sequence,
2398       and the pattern contains no explicit matches for CR or  LF  characters,
2399       the  match  position  is  advanced by two characters instead of one, in
2400       other words, to after the CRLF.
2401
2402       The above rule is a compromise that makes the most common cases work as
2403       expected.  For  example,  if  the  pattern is .+A (and the PCRE2_DOTALL
2404       option is not set), it does not match the string "\r\nA" because, after
2405       failing  at the start, it skips both the CR and the LF before retrying.
2406       However, the pattern [\r\n]A does match that string,  because  it  con‐
2407       tains an explicit CR or LF reference, and so advances only by one char‐
2408       acter after the first failure.
2409
2410       An explicit match for CR or LF is either a literal appearance of one of
2411       those  characters  in the pattern, or one of the \r or \n or equivalent
2412       octal or hexadecimal escape sequences. Implicit matches such as [^X] do
2413       not  count, nor does \s, even though it includes CR and LF in the char‐
2414       acters that it matches.
2415
2416       Notwithstanding the above, anomalous effects may still occur when  CRLF
2417       is a valid newline sequence and explicit \r or \n escapes appear in the
2418       pattern.
2419

HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS

2421
2422       uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
2423
2424       PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
2425
2426       In general, a pattern matches a certain portion of the subject, and  in
2427       addition,  further  substrings  from  the  subject may be picked out by
2428       parenthesized parts of the pattern.  Following  the  usage  in  Jeffrey
2429       Friedl's  book,  this  is  called  "capturing" in what follows, and the
2430       phrase "capturing subpattern" or "capturing group" is used for a  frag‐
2431       ment  of  a  pattern that picks out a substring. PCRE2 supports several
2432       other kinds of parenthesized subpattern that do not cause substrings to
2433       be  captured. The pcre2_pattern_info() function can be used to find out
2434       how many capturing subpatterns there are in a compiled pattern.
2435
2436       You can use auxiliary functions for accessing  captured  substrings  by
2437       number or by name, as described in sections below.
2438
2439       Alternatively, you can make direct use of the vector of PCRE2_SIZE val‐
2440       ues, called  the  ovector,  which  contains  the  offsets  of  captured
2441       strings.   It   is   part  of  the  match  data  block.   The  function
2442       pcre2_get_ovector_pointer() returns the address  of  the  ovector,  and
2443       pcre2_get_ovector_count() returns the number of pairs of values it con‐
2444       tains.
2445
2446       Within the ovector, the first in each pair of values is set to the off‐
2447       set of the first code unit of a substring, and the second is set to the
2448       offset of the first code unit after the end of a substring. These  val‐
2449       ues  are always code unit offsets, not character offsets. That is, they
2450       are byte offsets in the 8-bit library, 16-bit  offsets  in  the  16-bit
2451       library, and 32-bit offsets in the 32-bit library.
2452
2453       After  a  partial  match  (error  return PCRE2_ERROR_PARTIAL), only the
2454       first pair of offsets (that is, ovector[0]  and  ovector[1])  are  set.
2455       They  identify  the part of the subject that was partially matched. See
2456       the pcre2partial documentation for details of partial matching.
2457
2458       After a fully successful match, the first pair  of  offsets  identifies
2459       the  portion  of the subject string that was matched by the entire pat‐
2460       tern. The next pair is used for the first captured  substring,  and  so
2461       on.  The  value  returned by pcre2_match() is one more than the highest
2462       numbered pair that has been set. For example, if  two  substrings  have
2463       been  captured,  the returned value is 3. If there are no captured sub‐
2464       strings, the return value from a successful match is 1, indicating that
2465       just the first pair of offsets has been set.
2466
2467       If  a  pattern uses the \K escape sequence within a positive assertion,
2468       the reported start of a successful match can be greater than the end of
2469       the  match.   For  example,  if the pattern (?=ab\K) is matched against
2470       "ab", the start and end offset values for the match are 2 and 0.
2471
2472       If a capturing subpattern group is matched repeatedly within  a  single
2473       match  operation, it is the last portion of the subject that it matched
2474       that is returned.
2475
2476       If the ovector is too small to hold all the captured substring offsets,
2477       as  much  as possible is filled in, and the function returns a value of
2478       zero. If captured substrings are not of interest, pcre2_match() may  be
2479       called with a match data block whose ovector is of minimum length (that
2480       is, one pair).
2481
2482       It is possible for capturing subpattern number n+1 to match  some  part
2483       of the subject when subpattern n has not been used at all. For example,
2484       if the string "abc" is matched  against  the  pattern  (a|(z))(bc)  the
2485       return from the function is 4, and subpatterns 1 and 3 are matched, but
2486       2 is not. When this happens, both values in  the  offset  pairs  corre‐
2487       sponding to unused subpatterns are set to PCRE2_UNSET.
2488
2489       Offset  values  that correspond to unused subpatterns at the end of the
2490       expression are also set to PCRE2_UNSET.  For  example,  if  the  string
2491       "abc" is matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3
2492       are not matched.  The return from the function is 2, because the  high‐
2493       est used capturing subpattern number is 1. The  offsets  for  the  sec‐
2494       ond and third capturing  subpatterns  (assuming  the  vector  is  large
2495       enough, of course) are set to PCRE2_UNSET.
2496
2497       Elements in the ovector that do not correspond to capturing parentheses
2498       in the pattern are never changed. That is, if a pattern contains n cap‐
2499       turing parentheses, no more than ovector[0] to ovector[2n+1] are set by
2500       pcre2_match(). The other elements retain whatever  values  they  previ‐
2501       ously  had.  After  a failed match attempt, the contents of the ovector
2502       are unchanged.
2503

OTHER INFORMATION ABOUT A MATCH

2505
2506       PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
2507
2508       PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
2509
2510       As well as the offsets in the ovector, other information about a  match
2511       is  retained  in the match data block and can be retrieved by the above
2512       functions in appropriate circumstances. If they  are  called  at  other
2513       times, the result is undefined.
2514
2515       After  a  successful match, a partial match (PCRE2_ERROR_PARTIAL), or a
2516       failure to match (PCRE2_ERROR_NOMATCH), a (*MARK), (*PRUNE), or (*THEN)
2517       name  may  be available. The function pcre2_get_mark() can be called to
2518       access this name. The same function applies  to  all  three  verbs.  It
2519       returns a pointer to the zero-terminated name, which is within the com‐
2520       piled pattern. If no name is available, NULL is returned. The length of
2521       the  name  (excluding  the terminating zero) is stored in the code unit
2522       that precedes the name. You should use this length instead  of  relying
2523       on the terminating zero if the name might contain a binary zero.
2524
2525       After  a  successful  match,  the  name  that  is  returned is the last
2526       (*MARK), (*PRUNE), or (*THEN) name encountered  on  the  matching  path
2527       through  the  pattern.  Instances of (*PRUNE) and (*THEN) without names
2528       are  ignored.  Thus,  for  example,  if  the  matching  path   contains
2529       (*MARK:A)(*PRUNE),  the  name "A" is returned.  After a "no match" or a
2530       partial match, the last encountered name  is  returned.   For  example,
2531       consider this pattern:
2532
2533         ^(*MARK:A)((*MARK:B)a|b)c
2534
2535       When  it  matches "bc", the returned name is A. The B mark is "seen" in
2536       the first branch of the group, but it is not on the matching  path.  On
2537       the  other  hand,  when  this pattern fails to match "bx", the returned
2538       name is B.
2539
2540       Warning: By default, certain start-of-match optimizations are  used  to
2541       give  a  fast "no match" result in some situations. For example, if the
2542       anchoring is removed from the pattern above, there is an initial  check
2543       for  the  presence  of  "c"  in the subject before running the matching
2544       engine. This check fails for "bx", causing a match failure without see‐
2545       ing any marks. You can disable the start-of-match optimizations by set‐
2546       ting the PCRE2_NO_START_OPTIMIZE option for pcre2_compile() or starting
2547       the pattern with (*NO_START_OPT).
2548
2549       After  a  successful  match, a partial match, or one of the invalid UTF
2550       errors (for example, PCRE2_ERROR_UTF8_ERR5), pcre2_get_startchar()  can
2551       be called. After a successful or partial match it returns the code unit
2552       offset of the character at which the match started. For  a  non-partial
2553       match,  this can be different to the value of ovector[0] if the pattern
2554       contains the \K escape sequence. After a partial match,  however,  this
2555       value  is  always the same as ovector[0] because \K does not affect the
2556       result of a partial match.
2557
2558       After a UTF check failure, pcre2_get_startchar() can be used to  obtain
2559       the code unit offset of the invalid UTF character. Details are given in
2560       the pcre2unicode page.
2561

ERROR RETURNS FROM pcre2_match()

2563
2564       If pcre2_match() fails, it returns a negative number. This can be  con‐
2565       verted  to a text string by calling the pcre2_get_error_message() func‐
2566       tion (see "Obtaining a textual error message" below).   Negative  error
2567       codes  are  also  returned  by other functions, and are documented with
2568       them. The codes are given names in the header file. If UTF checking  is
2569       in force and an invalid UTF subject string is detected, one of a number
2570       of UTF-specific negative error codes is returned. Details are given  in
2571       the  pcre2unicode  page. The following are the other errors that may be
2572       returned by pcre2_match():
2573
2574         PCRE2_ERROR_NOMATCH
2575
2576       The subject string did not match the pattern.
2577
2578         PCRE2_ERROR_PARTIAL
2579
2580       The subject string did not match, but it did match partially.  See  the
2581       pcre2partial documentation for details of partial matching.
2582
2583         PCRE2_ERROR_BADMAGIC
2584
2585       PCRE2 stores a 4-byte "magic number" at the start of the compiled code,
2586       to catch the case when it is passed a junk pointer. This is  the  error
2587       that is returned when the magic number is not present.
2588
2589         PCRE2_ERROR_BADMODE
2590
2591       This  error is given when a compiled pattern is passed to a function in
2592       a library of a different code unit width, for example, a  pattern  com‐
2593       piled  by  the  8-bit  library  is passed to a 16-bit or 32-bit library
2594       function.
2595
2596         PCRE2_ERROR_BADOFFSET
2597
2598       The value of startoffset was greater than the length of the subject.
2599
2600         PCRE2_ERROR_BADOPTION
2601
2602       An unrecognized bit was set in the options argument.
2603
2604         PCRE2_ERROR_BADUTFOFFSET
2605
2606       The UTF code unit sequence that was passed as a subject was checked and
2607       found  to be valid (the PCRE2_NO_UTF_CHECK option was not set), but the
2608       value of startoffset did not point to the beginning of a UTF  character
2609       or the end of the subject.
2610
2611         PCRE2_ERROR_CALLOUT
2612
2613       This  error  is never generated by pcre2_match() itself. It is provided
2614       for use by callout  functions  that  want  to  cause  pcre2_match()  or
2615       pcre2_callout_enumerate()  to  return a distinctive error code. See the
2616       pcre2callout documentation for details.
2617
2618         PCRE2_ERROR_DEPTHLIMIT
2619
2620       The nested backtracking depth limit was reached.
2621
2622         PCRE2_ERROR_HEAPLIMIT
2623
2624       The heap limit was reached.
2625
2626         PCRE2_ERROR_INTERNAL
2627
2628       An unexpected internal error has occurred. This error could  be  caused
2629       by a bug in PCRE2 or by overwriting of the compiled pattern.
2630
2631         PCRE2_ERROR_JIT_STACKLIMIT
2632
2633       This  error  is  returned  when a pattern that was successfully studied
2634       using JIT is being matched, but the memory available for  the  just-in-
2635       time  processing stack is not large enough. See the pcre2jit documenta‐
2636       tion for more details.
2637
2638         PCRE2_ERROR_MATCHLIMIT
2639
2640       The backtracking match limit was reached.
2641
2642         PCRE2_ERROR_NOMEMORY
2643
2644       If a pattern contains many nested backtracking points, heap  memory  is
2645       used  to  remember them. This error is given when the memory allocation
2646       function (default or  custom)  fails.  Note  that  a  different  error,
2647       PCRE2_ERROR_HEAPLIMIT,  is given if the amount of memory needed exceeds
2648       the heap limit.
2649
2650         PCRE2_ERROR_NULL
2651
2652       Either the code, subject, or match_data argument was passed as NULL.
2653
2654         PCRE2_ERROR_RECURSELOOP
2655
2656       This error is returned when  pcre2_match()  detects  a  recursion  loop
2657       within  the  pattern. Specifically, it means that either the whole pat‐
2658       tern or a subpattern has been called recursively for the second time at
2659       the  same  position  in  the  subject string. Some simple patterns that
2660       might do this are detected and faulted at compile time, but  more  com‐
2661       plicated  cases,  in particular mutual recursions between two different
2662       subpatterns, cannot be detected until matching is attempted.
2663

OBTAINING A TEXTUAL ERROR MESSAGE

2665
2666       int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
2667         PCRE2_SIZE bufflen);
2668
2669       A text message for an error code  from  any  PCRE2  function  (compile,
2670       match,  or  auxiliary)  can be obtained by calling pcre2_get_error_mes‐
2671       sage(). The code is passed as the first argument,  with  the  remaining
2672       two  arguments  specifying  a  code  unit buffer and its length in code
2673       units, into which the text message is placed. The message  is  returned
2674       in  code  units  of the appropriate width for the library that is being
2675       used.
2676
2677       The returned message is terminated with a trailing zero, and the  func‐
2678       tion  returns  the  number  of  code units used, excluding the trailing
2679       zero.  If  the  error  number  is  unknown,  the  negative  error  code
2680       PCRE2_ERROR_BADDATA  is  returned. If the buffer is too small, the mes‐
2681       sage is truncated (but still with a trailing zero),  and  the  negative
2682       error  code PCRE2_ERROR_NOMEMORY is returned.  None of the messages are
2683       very long; a buffer size of 120 code units is ample.
2684

EXTRACTING CAPTURED SUBSTRINGS BY NUMBER

2686
2687       int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
2688         uint32_t number, PCRE2_SIZE *length);
2689
2690       int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
2691         uint32_t number, PCRE2_UCHAR *buffer,
2692         PCRE2_SIZE *bufflen);
2693
2694       int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
2695         uint32_t number, PCRE2_UCHAR **bufferptr,
2696         PCRE2_SIZE *bufflen);
2697
2698       void pcre2_substring_free(PCRE2_UCHAR *buffer);
2699
2700       Captured substrings can be accessed directly by using  the  ovector  as
2701       described above.  For convenience, auxiliary functions are provided for
2702       extracting  captured  substrings  as  new,  separate,   zero-terminated
2703       strings. A substring that contains a binary zero is correctly extracted
2704       and has a further zero added on the end, but  the  result  is  not,  of
2705       course, a C string.
2706
2707       The functions in this section identify substrings by number. The number
2708       zero refers to the entire matched substring, with higher numbers refer‐
2709       ring  to  substrings  captured by parenthesized groups. After a partial
2710       match, only substring zero is available.  An  attempt  to  extract  any
2711       other  substring  gives the error PCRE2_ERROR_PARTIAL. The next section
2712       describes similar functions for extracting captured substrings by name.
2713
2714       If a pattern uses the \K escape sequence within a  positive  assertion,
2715       the reported start of a successful match can be greater than the end of
2716       the match.  For example, if the pattern  (?=ab\K)  is  matched  against
2717       "ab",  the  start  and  end offset values for the match are 2 and 0. In
2718       this situation, calling these functions with a  zero  substring  number
2719       extracts a zero-length empty string.
2720
2721       You  can  find the length in code units of a captured substring without
2722       extracting it by calling pcre2_substring_length_bynumber().  The  first
2723       argument  is a pointer to the match data block, the second is the group
2724       number, and the third is a pointer to a variable into which the  length
2725       is  placed.  If  you just want to know whether or not the substring has
2726       been captured, you can pass the third argument as NULL.
2727
2728       The pcre2_substring_copy_bynumber() function  copies  a  captured  sub‐
2729       string  into  a supplied buffer, whereas pcre2_substring_get_bynumber()
2730       copies it into new memory, obtained using the  same  memory  allocation
2731       function  that  was  used for the match data block. The first two argu‐
2732       ments of these functions are a pointer to the match data  block  and  a
2733       capturing group number.
2734
2735       The final arguments of pcre2_substring_copy_bynumber() are a pointer to
2736       the buffer and a pointer to a variable that contains its length in code
2737       units.  This is updated to contain the actual number of code units used
2738       for the extracted substring, excluding the terminating zero.
2739
2740       For pcre2_substring_get_bynumber() the third and fourth arguments point
2741       to  variables that are updated with a pointer to the new memory and the
2742       number of code units that comprise the substring, again  excluding  the
2743       terminating  zero.  When  the substring is no longer needed, the memory
2744       should be freed by calling pcre2_substring_free().
2745
2746       The return value from all these functions is zero  for  success,  or  a
2747       negative  error  code.  If  the pattern match failed, the match failure
2748       code is returned.  If a substring number  greater  than  zero  is  used
2749       after  a partial match, PCRE2_ERROR_PARTIAL is returned. Other possible
2750       error codes are:
2751
2752         PCRE2_ERROR_NOMEMORY
2753
2754       The buffer was too small for  pcre2_substring_copy_bynumber(),  or  the
2755       attempt to get memory failed for pcre2_substring_get_bynumber().
2756
2757         PCRE2_ERROR_NOSUBSTRING
2758
2759       There  is  no  substring  with that number in the pattern, that is, the
2760       number is greater than the number of capturing parentheses.
2761
2762         PCRE2_ERROR_UNAVAILABLE
2763
2764       The substring number, though not greater than the number of captures in
2765       the pattern, is greater than the number of slots in the ovector, so the
2766       substring could not be captured.
2767
2768         PCRE2_ERROR_UNSET
2769
2770       The substring did not participate in the match.  For  example,  if  the
2771       pattern  is  (abc)|(def) and the subject is "def", and the ovector con‐
2772       tains at least two capturing slots, substring number 1 is unset.
2773

EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS

2775
2776       int pcre2_substring_list_get(pcre2_match_data *match_data,
2777         PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
2778
2779       void pcre2_substring_list_free(PCRE2_SPTR *list);
2780
2781       The pcre2_substring_list_get() function  extracts  all  available  sub‐
2782       strings  and  builds  a  list of pointers to them. It also (optionally)
2783       builds a second list that  contains  their  lengths  (in  code  units),
2784       excluding a terminating zero that is added to each of them. All this is
2785       done in a single block of memory that is obtained using the same memory
2786       allocation function that was used to get the match data block.
2787
2788       This  function  must be called only after a successful match. If called
2789       after a partial match, the error code PCRE2_ERROR_PARTIAL is returned.
2790
2791       The address of the memory block is returned via listptr, which is  also
2792       the start of the list of string pointers. The end of the list is marked
2793       by a NULL pointer. The address of the list of lengths is  returned  via
2794       lengthsptr.  If your strings do not contain binary zeros and you do not
2795       therefore need the lengths, you may supply NULL as the lengthsptr argu‐
2796       ment  to  disable  the  creation of a list of lengths. The yield of the
2797       function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the  mem‐
2798       ory  block could not be obtained. When the list is no longer needed, it
2799       should be freed by calling pcre2_substring_list_free().
2800
2801       If this function encounters a substring that is unset, which can happen
2802       when  capturing subpattern number n+1 matches some part of the subject,
2803       but subpattern n has not been used at all, it returns an empty  string.
2804       This  can  be  distinguished  from  a  genuine zero-length substring by
2805       inspecting  the  appropriate  offset  in  the  ovector, which  contains
2806       PCRE2_UNSET   for   unset   substrings,   or   by   calling  pcre2_sub‐
2807       string_length_bynumber().
2808

EXTRACTING CAPTURED SUBSTRINGS BY NAME

2810
2811       int pcre2_substring_number_from_name(const pcre2_code *code,
2812         PCRE2_SPTR name);
2813
2814       int pcre2_substring_length_byname(pcre2_match_data *match_data,
2815         PCRE2_SPTR name, PCRE2_SIZE *length);
2816
2817       int pcre2_substring_copy_byname(pcre2_match_data *match_data,
2818         PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
2819
2820       int pcre2_substring_get_byname(pcre2_match_data *match_data,
2821         PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
2822
2823       void pcre2_substring_free(PCRE2_UCHAR *buffer);
2824
2825       To extract a substring by name, you first have to find the associated num‐
2826       ber.  For example, for this pattern:
2827
2828         (a+)b(?<xxx>\d+)...
2829
2830       the number of the subpattern called "xxx" is 2. If the name is known to
2831       be unique (PCRE2_DUPNAMES was not set), you can find  the  number  from
2832       the name by calling pcre2_substring_number_from_name(). The first argu‐
2833       ment is the compiled pattern, and the second is the name. The yield  of
2834       the function is the subpattern number, PCRE2_ERROR_NOSUBSTRING if there
2835       is no subpattern of  that  name,  or  PCRE2_ERROR_NOUNIQUESUBSTRING  if
2836       there  is  more than one subpattern of that name. Given the number, you
2837       can extract the substring directly from the ovector, or use one of  the
2838       "bynumber" functions described above.
2839
2840       For  convenience,  there are also "byname" functions that correspond to
2841       the "bynumber" functions, the only difference  being  that  the  second
2842       argument  is  a  name instead of a number. If PCRE2_DUPNAMES is set and
2843       there are duplicate names, these functions scan all the groups with the
2844       given name, and return the first named string that is set.
2845
2846       If  there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is
2847       returned. If all groups with the name have  numbers  that  are  greater
2848       than  the  number  of  slots in the ovector, PCRE2_ERROR_UNAVAILABLE is
2849       returned. If there is at least one group with a slot  in  the  ovector,
2850       but no group is found to be set, PCRE2_ERROR_UNSET is returned.
2851
2852       Warning: If the pattern uses the (?| feature to set up multiple subpat‐
2853       terns with the same number, as described in the  section  on  duplicate
2854       subpattern  numbers  in  the pcre2pattern page, you cannot use names to
2855       distinguish the different subpatterns, because names are  not  included
2856       in  the compiled code. The matching process uses only numbers. For this
2857       reason, the use of different names for subpatterns of the  same  number
2858       causes an error at compile time.
2859

CREATING A NEW STRING WITH SUBSTITUTIONS

2861
2862       int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
2863         PCRE2_SIZE length, PCRE2_SIZE startoffset,
2864         uint32_t options, pcre2_match_data *match_data,
2865         pcre2_match_context *mcontext, PCRE2_SPTR replacement,
2866         PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
2867         PCRE2_SIZE *outlengthptr);
2868
2869       This  function calls pcre2_match() and then makes a copy of the subject
2870       string in outputbuffer, replacing the part that was  matched  with  the
2871       replacement  string,  whose  length is supplied in rlength. This can be
2872       given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
2873       which  a  \K item in a lookahead in the pattern causes the match to end
2874       before it starts are not supported, and give rise to an  error  return.
2875       For global replacements, matches in which \K in a lookbehind causes the
2876       match to start earlier than the point that was reached in the  previous
2877       iteration are also not supported.
2878
2879       The  first  seven  arguments  of pcre2_substitute() are the same as for
2880       pcre2_match(), except that the partial matching options are not permit‐
2881       ted,  and  match_data may be passed as NULL, in which case a match data
2882       block is obtained and freed within this function, using memory  manage‐
2883       ment  functions from the match context, if provided, or else those that
2884       were used to allocate memory for the compiled code.
2885
2886       If an external match_data block is provided,  its  contents  afterwards
2887       are those set by the final call to pcre2_match(), which will have ended
2888       in a matching error. The contents of the ovector within the match  data
2889       block may or may not have been changed.
2890
2891       The  outlengthptr  argument  must point to a variable that contains the
2892       length, in code units, of the output buffer. If the  function  is  suc‐
2893       cessful,  the value is updated to contain the length of the new string,
2894       excluding the trailing zero that is automatically added.
2895
2896       If the function is not  successful,  the  value  set  via  outlengthptr
2897       depends  on  the  type  of  error. For syntax errors in the replacement
2898       string, the value is the offset in the  replacement  string  where  the
2899       error  was  detected.  For  other  errors,  the value is PCRE2_UNSET by
2900       default. This includes the case of the output buffer being  too  small,
2901       unless  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  is  set (see below), in which
2902       case the value is the minimum length needed, including  space  for  the
2903       trailing  zero.  Note  that  in  order  to compute the required length,
2904       pcre2_substitute() has  to  simulate  all  the  matching  and  copying,
2905       instead of giving an error return as soon as the buffer overflows. Note
2906       also that the length is in code units, not bytes.
2907
2908       In the replacement string, which is interpreted as a UTF string in  UTF
2909       mode,  and  is  checked  for UTF validity unless the PCRE2_NO_UTF_CHECK
2910       option is set, a dollar character is an escape character that can spec‐
2911       ify  the  insertion  of  characters  from  capturing groups or (*MARK),
2912       (*PRUNE), or (*THEN) items in the  pattern.  The  following  forms  are
2913       always recognized:
2914
2915         $$                  insert a dollar character
2916         $<n> or ${<n>}      insert the contents of group <n>
2917         $*MARK or ${*MARK}  insert a (*MARK), (*PRUNE), or (*THEN) name
2918
2919       Either  a  group  number  or  a  group name can be given for <n>. Curly
2920       brackets are required only if the following character would  be  inter‐
2921       preted as part of the number or name. The number may be zero to include
2922       the entire matched string.   For  example,  if  the  pattern  a(b)c  is
2923       matched  with "=abc=" and the replacement string "+$1$0$1+", the result
2924       is "=+babcb+=".
2925
2926       $*MARK inserts the name from the last encountered (*MARK), (*PRUNE), or
2927       (*THEN)  on  the  matching  path  that  has a name. (*MARK) must always
2928       include a name, but (*PRUNE) and (*THEN) need not. For example, in  the
2929       case   of   (*MARK:A)(*PRUNE)   the  name  inserted  is  "A",  but  for
2930       (*MARK:A)(*PRUNE:B) the relevant name is "B".   This  facility  can  be
2931       used  to  perform  simple simultaneous substitutions, as this pcre2test
2932       example shows:
2933
2934         /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
2935             apple lemon
2936          2: pear orange
2937
2938       As well as the usual options for pcre2_match(), a number of  additional
2939       options can be set in the options argument of pcre2_substitute().
2940
2941       PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject
2942       string, replacing every matching substring. If this option is not  set,
2943       only  the  first matching substring is replaced. The search for matches
2944       takes place in the original subject string (that is, previous  replace‐
2945       ments  do  not  affect  it).  Iteration is implemented by advancing the
2946       startoffset value for each search, which is always  passed  the  entire
2947       subject string. If an offset limit is set in the match context, search‐
2948       ing stops when that limit is reached.
2949
2950       You can restrict the effect of a global substitution to  a  portion  of
2951       the subject string by setting either or both of startoffset and an off‐
2952       set limit. Here is a pcre2test example:
2953
2954         /B/g,replace=!,use_offset_limit
2955         ABC ABC ABC ABC\=offset=3,offset_limit=12
2956          2: ABC A!C A!C ABC
2957
2958       When continuing with global substitutions after  matching  a  substring
2959       with zero length, an attempt to find a non-empty match at the same off‐
2960       set is performed.  If this is not successful, the offset is advanced by
2961       one character except when CRLF is a valid newline sequence and the next
2962       two characters are CR, LF. In this case, the offset is advanced by  two
2963       characters.
2964
2965       PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  changes  what happens when the output
2966       buffer is too small. The default action is to return PCRE2_ERROR_NOMEM‐
2967       ORY  immediately.  If  this  option is set, however, pcre2_substitute()
2968       continues to go through the motions of matching and substituting (with‐
2969       out,  of course, writing anything) in order to compute the size of buf‐
2970       fer that is needed. This value is  passed  back  via  the  outlengthptr
2971       variable,    with    the   result   of   the   function   still   being
2972       PCRE2_ERROR_NOMEMORY.
2973
2974       Passing a buffer size of zero is a permitted way  of  finding  out  how
2975       much memory is needed for a given substitution. However, this does mean
2976       that the entire operation is carried out twice. Depending on the appli‐
2977       cation,  it  may  be more efficient to allocate a large buffer and free
2978       the  excess  afterwards,  instead   of   using   PCRE2_SUBSTITUTE_OVER‐
2979       FLOW_LENGTH.
2980
2981       PCRE2_SUBSTITUTE_UNKNOWN_UNSET  causes  references  to capturing groups
2982       that do not appear in the pattern to be treated as unset  groups.  This
2983       option  should  be  used  with  care, because it means that a typo in a
2984       group name or  number  no  longer  causes  the  PCRE2_ERROR_NOSUBSTRING
2985       error.
2986
2987       PCRE2_SUBSTITUTE_UNSET_EMPTY  causes  unset capturing groups (including
2988       unknown  groups  when  PCRE2_SUBSTITUTE_UNKNOWN_UNSET  is  set)  to  be
2989       treated  as  empty  strings  when  inserted as described above. If this
2990       option is not set, an attempt to  insert  an  unset  group  causes  the
2991       PCRE2_ERROR_UNSET  error.  This  option does not influence the extended
2992       substitution syntax described below.
2993
2994       PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to  the
2995       replacement  string.  Without this option, only the dollar character is
2996       special, and only the group insertion forms  listed  above  are  valid.
2997       When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
2998
2999       Firstly,  backslash in a replacement string is interpreted as an escape
3000       character. The usual forms such as \n or \x{ddd} can be used to specify
3001       particular  character codes, and backslash followed by any non-alphanu‐
3002       meric character quotes that character. Extended quoting  can  be  coded
3003       using \Q...\E, exactly as in pattern strings.
3004
3005       There  are  also four escape sequences for forcing the case of inserted
3006       letters.  The insertion mechanism has three states:  no  case  forcing,
3007       force upper case, and force lower case. The escape sequences change the
3008       current state: \U and \L change to upper or lower case forcing, respec‐
3009       tively,  and  \E (when not terminating a \Q quoted sequence) reverts to
3010       no case forcing. The sequences \u and \l force the next  character  (if
3011       it  is  a  letter)  to  upper or lower case, respectively, and then the
3012       state automatically reverts to no case forcing. Case forcing applies to
3013       all inserted  characters, including those from captured groups and let‐
3014       ters within \Q...\E quoted sequences.
3015
3016       Note that case forcing sequences such as \U...\E do not nest. For exam‐
3017       ple,  the  result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final
3018       \E has no effect.
3019
3020       The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to  add  more
3021       flexibility  to  group substitution. The syntax is similar to that used
3022       by Bash:
3023
3024         ${<n>:-<string>}
3025         ${<n>:+<string1>:<string2>}
3026
3027       As before, <n> may be a group number or a name. The first  form  speci‐
3028       fies  a  default  value. If group <n> is set, its value is inserted; if
3029       not, <string> is expanded and the  result  inserted.  The  second  form
3030       specifies  strings that are expanded and inserted when group <n> is set
3031       or unset, respectively. The first form is just a  convenient  shorthand
3032       for
3033
3034         ${<n>:+${<n>}:<string>}
3035
3036       Backslash  can  be  used to escape colons and closing curly brackets in
3037       the replacement strings. A change of the case forcing  state  within  a
3038       replacement  string  remains  in  force  afterwards,  as  shown in this
3039       pcre2test example:
3040
3041         /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
3042             body
3043          1: hello
3044             somebody
3045          1: HELLO
3046
3047       The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these  extended
3048       substitutions.   However,   PCRE2_SUBSTITUTE_UNKNOWN_UNSET  does  cause
3049       unknown groups in the extended syntax forms to be treated as unset.
3050
3051       If successful, pcre2_substitute() returns the  number  of  replacements
3052       that were made. This may be zero if no matches were found, and is never
3053       greater than 1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
3054
3055       In the event of an error, a negative error code is returned. Except for
3056       PCRE2_ERROR_NOMATCH    (which   is   never   returned),   errors   from
3057       pcre2_match() are passed straight back.
3058
3059       PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring inser‐
3060       tion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
3061
3062       PCRE2_ERROR_UNSET is returned for an unset substring insertion (includ‐
3063       ing an unknown substring when  PCRE2_SUBSTITUTE_UNKNOWN_UNSET  is  set)
3064       when  the  simple  (non-extended)  syntax  is  used  and  PCRE2_SUBSTI‐
3065       TUTE_UNSET_EMPTY is not set.
3066
3067       PCRE2_ERROR_NOMEMORY is returned  if  the  output  buffer  is  not  big
3068       enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size
3069       of buffer that is needed is returned via outlengthptr. Note  that  this
3070       does not happen by default.
3071
3072       PCRE2_ERROR_BADREPLACEMENT  is  used for miscellaneous syntax errors in
3073       the   replacement   string,   with   more   particular   errors   being
3074       PCRE2_ERROR_BADREPESCAPE  (invalid  escape  sequence), PCRE2_ERROR_REP‐
3075       MISSINGBRACE (closing curly bracket not found),  PCRE2_ERROR_BADSUBSTI‐
3076       TUTION   (syntax   error   in   extended   group   substitution),   and
3077       PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before  it  started
3078       or  the match started earlier than the current position in the subject,
3079       which can happen if \K is used in an assertion).
3080
3081       As for all PCRE2 errors, a text message that describes the error can be
3082       obtained   by   calling  the  pcre2_get_error_message()  function  (see
3083       "Obtaining a textual error message" above).
3084

DUPLICATE SUBPATTERN NAMES

3086
3087       int pcre2_substring_nametable_scan(const pcre2_code *code,
3088         PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
3089
3090       When a pattern is compiled with the PCRE2_DUPNAMES  option,  names  for
3091       subpatterns  are  not required to be unique. Duplicate names are always
3092       allowed for subpatterns with the same number, created by using the  (?|
3093       feature.  Indeed,  if  such subpatterns are named, they are required to
3094       use the same names.
3095
3096       Normally, patterns with duplicate names are such that in any one match,
3097       only  one of the named subpatterns participates. An example is shown in
3098       the pcre2pattern documentation.
3099
3100       When  duplicates   are   present,   pcre2_substring_copy_byname()   and
3101       pcre2_substring_get_byname()  return  the first substring corresponding
3102       to  the  given  name  that  is  set.  Only   if   none   are   set   is
3103       PCRE2_ERROR_UNSET returned.  The  pcre2_substring_number_from_name()
3104       function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
3105       duplicate names.
3106
3107       If  you want to get full details of all captured substrings for a given
3108       name, you must use the pcre2_substring_nametable_scan()  function.  The
3109       first  argument is the compiled pattern, and the second is the name. If
3110       the third and fourth arguments are NULL, the function returns  a  group
3111       number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
3112
3113       When the third and fourth arguments are not NULL, they must be pointers
3114       to variables that are updated by the function. After it has  run,  they
3115       point to the first and last entries in the name-to-number table for the
3116       given name, and the function returns the length of each entry  in  code
3117       units.  In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
3118       no entries for the given name.
3119
3120       The format of the name table is described above in the section entitled
3121       Information  about  a  pattern.  Given all the relevant entries for the
3122       name, you can extract each of their numbers,  and  hence  the  captured
3123       data.
3124

FINDING ALL POSSIBLE MATCHES AT ONE POSITION

3126
3127       The  traditional  matching  function  uses a similar algorithm to Perl,
3128       which stops when it finds the first match at a given point in the  sub‐
3129       ject. If you want to find all possible matches, or the longest possible
3130       match at a given position,  consider  using  the  alternative  matching
3131       function  (see  below) instead. If you cannot use the alternative func‐
3132       tion, you can kludge it up by making use of the callout facility, which
3133       is described in the pcre2callout documentation.
3134
3135       What you have to do is to insert a callout right at the end of the pat‐
3136       tern.  When your callout function is called, extract and save the  cur‐
3137       rent  matched  substring.  Then return 1, which forces pcre2_match() to
3138       backtrack and try other alternatives. Ultimately, when it runs  out  of
3139       matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
3140

MATCHING A PATTERN: THE ALTERNATIVE FUNCTION

3142
3143       int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
3144         PCRE2_SIZE length, PCRE2_SIZE startoffset,
3145         uint32_t options, pcre2_match_data *match_data,
3146         pcre2_match_context *mcontext,
3147         int *workspace, PCRE2_SIZE wscount);
3148
3149       The  function  pcre2_dfa_match()  is  called  to match a subject string
3150       against a compiled pattern, using a matching algorithm that  scans  the
3151       subject string just once (not counting lookaround assertions), and does
3152       not backtrack.  This has different characteristics to the normal  algo‐
3153       rithm,  and  is not compatible with Perl. Some of the features of PCRE2
3154       patterns are not supported.  Nevertheless, there are  times  when  this
3155       kind  of  matching  can be useful. For a discussion of the two matching
3156       algorithms, and a list of features that pcre2_dfa_match() does not sup‐
3157       port, see the pcre2matching documentation.
3158
3159       The  arguments  for  the pcre2_dfa_match() function are the same as for
3160       pcre2_match(), plus two extras. The ovector within the match data block
3161       is used in a different way, and this is described below. The other com‐
3162       mon arguments are used in the same way as for pcre2_match(),  so  their
3163       description is not repeated here.
3164
3165       The  two  additional  arguments provide workspace for the function. The
3166       workspace vector should contain at least 20 elements. It  is  used  for
3167       keeping  track  of  multiple  paths  through  the  pattern  tree.  More
3168       workspace is needed for patterns and subjects where there are a lot  of
3169       potential matches.
3170
3171       Here is an example of a simple call to pcre2_dfa_match():
3172
3173         int wspace[20];
3174         pcre2_match_data *md = pcre2_match_data_create(4, NULL);
3175         int rc = pcre2_dfa_match(
3176           re,             /* result of pcre2_compile() */
3177           "some string",  /* the subject string */
3178           11,             /* the length of the subject string */
3179           0,              /* start at offset 0 in the subject */
3180           0,              /* default options */
3181           md,             /* the match data block */
3182           NULL,           /* a match context; NULL means use defaults */
3183           wspace,         /* working space vector */
3184           20);            /* number of elements (NOT size in bytes) */
3185
3186   Option bits for pcre2_dfa_match()
3187
3188       The  unused  bits of the options argument for pcre2_dfa_match() must be
3189       zero. The only bits that may be set  are  PCRE2_ANCHORED,  PCRE2_ENDAN‐
3190       CHORED,        PCRE2_NOTBOL,        PCRE2_NOTEOL,       PCRE2_NOTEMPTY,
3191       PCRE2_NOTEMPTY_ATSTART,     PCRE2_NO_UTF_CHECK,     PCRE2_PARTIAL_HARD,
3192       PCRE2_PARTIAL_SOFT,  PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but
3193       the last four of these are exactly the same as  for  pcre2_match(),  so
3194       their description is not repeated here.
3195
3196         PCRE2_PARTIAL_HARD
3197         PCRE2_PARTIAL_SOFT
3198
3199       These  have  the  same general effect as they do for pcre2_match(), but
3200       the details are slightly different. When PCRE2_PARTIAL_HARD is set  for
3201       pcre2_dfa_match(),  it  returns  PCRE2_ERROR_PARTIAL  if the end of the
3202       subject is reached and there is still at least one matching possibility
3203       that requires additional characters. This happens even if some complete
3204       matches have already been found. When PCRE2_PARTIAL_SOFT  is  set,  the
3205       return  code  PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
3206       if the end of the subject is  reached,  there  have  been  no  complete
3207       matches, but there is still at least one matching possibility. The por‐
3208       tion of the string that was inspected when the  longest  partial  match
3209       was found is set as the first matching string in both cases. There is a
3210       more detailed discussion of partial and  multi-segment  matching,  with
3211       examples, in the pcre2partial documentation.
3212
3213         PCRE2_DFA_SHORTEST
3214
3215       Setting  the PCRE2_DFA_SHORTEST option causes the matching algorithm to
3216       stop as soon as it has found one match. Because of the way the alterna‐
3217       tive  algorithm  works, this is necessarily the shortest possible match
3218       at the first possible matching point in the subject string.
3219
3220         PCRE2_DFA_RESTART
3221
3222       When pcre2_dfa_match() returns a partial match, it is possible to  call
3223       it again, with additional subject characters, and have it continue with
3224       the same match. The PCRE2_DFA_RESTART option requests this action; when
3225       it is set, the workspace and wscount arguments must reference the  same
3226       vector as before because data about the match so far is  left  in  them
3227       after a partial match. There is more discussion of this facility in the
3228       pcre2partial documentation.
3229
3230   Successful returns from pcre2_dfa_match()
3231
3232       When pcre2_dfa_match() succeeds, it may have matched more than one sub‐
3233       string in the subject. Note, however, that all the matches from one run
3234       of the function start at the same point in  the  subject.  The  shorter
3235       matches  are all initial substrings of the longer matches. For example,
3236       if the pattern
3237
3238         <.*>
3239
3240       is matched against the string
3241
3242         This is <something> <something else> <something further> no more
3243
3244       the three matched strings are
3245
3246         <something> <something else> <something further>
3247         <something> <something else>
3248         <something>
3249
3250       On success, the yield of the function is a number  greater  than  zero,
3251       which  is  the  number  of  matched substrings. The offsets of the sub‐
3252       strings are returned in the ovector, and can be extracted by number  in
3253       the  same way as for pcre2_match(), but the numbers bear no relation to
3254       any capturing groups that may exist in the pattern, because DFA  match‐
3255       ing does not support group capture.
3256
3257       Calls  to  the  convenience  functions  that extract substrings by name
3258       return the error PCRE2_ERROR_DFA_UFUNC (unsupported function)  if  used
3259       after a DFA match. The convenience functions that extract substrings by
3260       number never return PCRE2_ERROR_NOSUBSTRING.
3261
3262       The matched strings are stored in  the  ovector  in  reverse  order  of
3263       length;  that  is,  the longest matching string is first. If there were
3264       too many matches to fit into the ovector, the yield of the function  is
3265       zero, and the vector is filled with the longest matches.
3266
3267       NOTE:  PCRE2's  "auto-possessification" optimization usually applies to
3268       character repeats at the end of a pattern (as well as internally).  For
3269       example,  the pattern "a\d+" is compiled as if it were "a\d++". For DFA
3270       matching, this means that only one possible  match  is  found.  If  you
3271       really  do  want multiple matches in such cases, either use an ungreedy
3272       repeat such as "a\d+?" or set  the  PCRE2_NO_AUTO_POSSESS  option  when
3273       compiling.
3274
3275   Error returns from pcre2_dfa_match()
3276
3277       The pcre2_dfa_match() function returns a negative number when it fails.
3278       Many of the errors are the same  as  for  pcre2_match(),  as  described
3279       above.  There are in addition the following errors that are specific to
3280       pcre2_dfa_match():
3281
3282         PCRE2_ERROR_DFA_UITEM
3283
3284       This return is given if pcre2_dfa_match() encounters  an  item  in  the
3285       pattern  that it does not support, for instance, the use of \C in a UTF
3286       mode or a backreference.
3287
3288         PCRE2_ERROR_DFA_UCOND
3289
3290       This return is given if pcre2_dfa_match() encounters a  condition  item
3291       that uses a backreference for the condition, or a test for recursion in
3292       a specific group. These are not supported.
3293
3294         PCRE2_ERROR_DFA_WSSIZE
3295
3296       This return is given if pcre2_dfa_match() runs  out  of  space  in  the
3297       workspace vector.
3298
3299         PCRE2_ERROR_DFA_RECURSE
3300
3301       When  a  recursive subpattern is processed, the matching function calls
3302       itself recursively, using private memory for the ovector and workspace.
3303       This  error  is given if the internal ovector is not large enough. This
3304       should be extremely rare, as a vector of size 1000 is used.
3305
3306         PCRE2_ERROR_DFA_BADRESTART
3307
3308       When pcre2_dfa_match() is called  with  the  PCRE2_DFA_RESTART  option,
3309       some  plausibility  checks  are  made on the contents of the workspace,
3310       which should contain data about the previous partial match. If  any  of
3311       these checks fail, this error is given.
3312

SEE ALSO

3314
3315       pcre2build(3),    pcre2callout(3),    pcre2demo(3),   pcre2matching(3),
3316       pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3).
3317

AUTHOR

3319
3320       Philip Hazel
3321       University Computing Service
3322       Cambridge, England.
3323

REVISION

3325
3326       Last updated: 07 September 2018
3327       Copyright (c) 1997-2018 University of Cambridge.
3328
3329
3330
3331PCRE2 10.32                    07 September 2018                   PCRE2API(3)
Impressum