1PCRE2API(3)                Library Functions Manual                PCRE2API(3)
2
3
4

NAME

6       PCRE2 - Perl-compatible regular expressions (revised API)
7
8       #include <pcre2.h>
9
10       PCRE2  is  a  new API for PCRE, starting at release 10.0. This document
11       contains a description of all its native functions. See the pcre2 docu‐
12       ment for an overview of all the PCRE2 documentation.
13

PCRE2 NATIVE API BASIC FUNCTIONS

15
16       pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
17         uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
18         pcre2_compile_context *ccontext);
19
20       void pcre2_code_free(pcre2_code *code);
21
22       pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
23         pcre2_general_context *gcontext);
24
25       pcre2_match_data *pcre2_match_data_create_from_pattern(
26         const pcre2_code *code, pcre2_general_context *gcontext);
27
28       int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
29         PCRE2_SIZE length, PCRE2_SIZE startoffset,
30         uint32_t options, pcre2_match_data *match_data,
31         pcre2_match_context *mcontext);
32
33       int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
34         PCRE2_SIZE length, PCRE2_SIZE startoffset,
35         uint32_t options, pcre2_match_data *match_data,
36         pcre2_match_context *mcontext,
37         int *workspace, PCRE2_SIZE wscount);
38
39       void pcre2_match_data_free(pcre2_match_data *match_data);
40

PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS

42
43       PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
44
45       uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
46
47       PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
48
49       PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
50

PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS

52
53       pcre2_general_context *pcre2_general_context_create(
54         void *(*private_malloc)(PCRE2_SIZE, void *),
55         void (*private_free)(void *, void *), void *memory_data);
56
57       pcre2_general_context *pcre2_general_context_copy(
58         pcre2_general_context *gcontext);
59
60       void pcre2_general_context_free(pcre2_general_context *gcontext);
61

PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS

63
64       pcre2_compile_context *pcre2_compile_context_create(
65         pcre2_general_context *gcontext);
66
67       pcre2_compile_context *pcre2_compile_context_copy(
68         pcre2_compile_context *ccontext);
69
70       void pcre2_compile_context_free(pcre2_compile_context *ccontext);
71
72       int pcre2_set_bsr(pcre2_compile_context *ccontext,
73         uint32_t value);
74
75       int pcre2_set_character_tables(pcre2_compile_context *ccontext,
76         const uint8_t *tables);
77
78       int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext,
79         uint32_t extra_options);
80
81       int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
82         PCRE2_SIZE value);
83
84       int pcre2_set_newline(pcre2_compile_context *ccontext,
85         uint32_t value);
86
87       int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
88         uint32_t value);
89
90       int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
91         int (*guard_function)(uint32_t, void *), void *user_data);
92

PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS

94
95       pcre2_match_context *pcre2_match_context_create(
96         pcre2_general_context *gcontext);
97
98       pcre2_match_context *pcre2_match_context_copy(
99         pcre2_match_context *mcontext);
100
101       void pcre2_match_context_free(pcre2_match_context *mcontext);
102
103       int pcre2_set_callout(pcre2_match_context *mcontext,
104         int (*callout_function)(pcre2_callout_block *, void *),
105         void *callout_data);
106
107       int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
108         int (*callout_function)(pcre2_substitute_callout_block *, void *),
109         void *callout_data);
110
111       int pcre2_set_offset_limit(pcre2_match_context *mcontext,
112         PCRE2_SIZE value);
113
114       int pcre2_set_heap_limit(pcre2_match_context *mcontext,
115         uint32_t value);
116
117       int pcre2_set_match_limit(pcre2_match_context *mcontext,
118         uint32_t value);
119
120       int pcre2_set_depth_limit(pcre2_match_context *mcontext,
121         uint32_t value);
122

PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS

124
125       int pcre2_substring_copy_byname(pcre2_match_data *match_data,
126         PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
127
128       int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
129         uint32_t number, PCRE2_UCHAR *buffer,
130         PCRE2_SIZE *bufflen);
131
132       void pcre2_substring_free(PCRE2_UCHAR *buffer);
133
134       int pcre2_substring_get_byname(pcre2_match_data *match_data,
135         PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
136
137       int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
138         uint32_t number, PCRE2_UCHAR **bufferptr,
139         PCRE2_SIZE *bufflen);
140
141       int pcre2_substring_length_byname(pcre2_match_data *match_data,
142         PCRE2_SPTR name, PCRE2_SIZE *length);
143
144       int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
145         uint32_t number, PCRE2_SIZE *length);
146
147       int pcre2_substring_nametable_scan(const pcre2_code *code,
148         PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
149
150       int pcre2_substring_number_from_name(const pcre2_code *code,
151         PCRE2_SPTR name);
152
153       void pcre2_substring_list_free(PCRE2_SPTR *list);
154
155       int pcre2_substring_list_get(pcre2_match_data *match_data,
156         PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
157

PCRE2 NATIVE API STRING SUBSTITUTION FUNCTION

159
160       int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
161         PCRE2_SIZE length, PCRE2_SIZE startoffset,
162         uint32_t options, pcre2_match_data *match_data,
163         pcre2_match_context *mcontext, PCRE2_SPTR replacement,
164         PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
165         PCRE2_SIZE *outlengthptr);
166

PCRE2 NATIVE API JIT FUNCTIONS

168
169       int pcre2_jit_compile(pcre2_code *code, uint32_t options);
170
171       int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
172         PCRE2_SIZE length, PCRE2_SIZE startoffset,
173         uint32_t options, pcre2_match_data *match_data,
174         pcre2_match_context *mcontext);
175
176       void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
177
178       pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE startsize,
179         PCRE2_SIZE maxsize, pcre2_general_context *gcontext);
180
181       void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
182         pcre2_jit_callback callback_function, void *callback_data);
183
184       void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
185

PCRE2 NATIVE API SERIALIZATION FUNCTIONS

187
188       int32_t pcre2_serialize_decode(pcre2_code **codes,
189         int32_t number_of_codes, const uint8_t *bytes,
190         pcre2_general_context *gcontext);
191
192       int32_t pcre2_serialize_encode(const pcre2_code **codes,
193         int32_t number_of_codes, uint8_t **serialized_bytes,
194         PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
195
196       void pcre2_serialize_free(uint8_t *bytes);
197
198       int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes);
199

PCRE2 NATIVE API AUXILIARY FUNCTIONS

201
202       pcre2_code *pcre2_code_copy(const pcre2_code *code);
203
204       pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
205
206       int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
207         PCRE2_SIZE bufflen);
208
209       const uint8_t *pcre2_maketables(pcre2_general_context *gcontext);
210
211       void pcre2_maketables_free(pcre2_general_context *gcontext,
212         const uint8_t *tables);
213
214       int pcre2_pattern_info(const pcre2_code *code, uint32_t what,
215         void *where);
216
217       int pcre2_callout_enumerate(const pcre2_code *code,
218         int (*callback)(pcre2_callout_enumerate_block *, void *),
219         void *user_data);
220
221       int pcre2_config(uint32_t what, void *where);
222

PCRE2 NATIVE API OBSOLETE FUNCTIONS

224
225       int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
226         uint32_t value);
227
228       int pcre2_set_recursion_memory_management(
229         pcre2_match_context *mcontext,
230         void *(*private_malloc)(PCRE2_SIZE, void *),
231         void (*private_free)(void *, void *), void *memory_data);
232
233       These  functions became obsolete at release 10.30 and are retained only
234       for backward compatibility. They should not be used in  new  code.  The
235       first  is  replaced by pcre2_set_depth_limit(); the second is no longer
236       needed and has no effect (it always returns zero).
237

PCRE2 EXPERIMENTAL PATTERN CONVERSION FUNCTIONS

239
240       pcre2_convert_context *pcre2_convert_context_create(
241         pcre2_general_context *gcontext);
242
243       pcre2_convert_context *pcre2_convert_context_copy(
244         pcre2_convert_context *cvcontext);
245
246       void pcre2_convert_context_free(pcre2_convert_context *cvcontext);
247
248       int pcre2_set_glob_escape(pcre2_convert_context *cvcontext,
249         uint32_t escape_char);
250
251       int pcre2_set_glob_separator(pcre2_convert_context *cvcontext,
252         uint32_t separator_char);
253
254       int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length,
255         uint32_t options, PCRE2_UCHAR **buffer,
256         PCRE2_SIZE *blength, pcre2_convert_context *cvcontext);
257
258       void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern);
259
260       These functions provide a way of  converting  non-PCRE2  patterns  into
261       patterns  that  can  be  processed by pcre2_compile(). This facility is
262       experimental and may be changed in future releases. At present, "globs"
263       and  POSIX  basic  and  extended patterns can be converted. Details are
264       given in the pcre2convert documentation.
265

PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES

267
268       There are three PCRE2 libraries, supporting 8-bit, 16-bit,  and  32-bit
269       code  units,  respectively.  However,  there  is  just one header file,
270       pcre2.h.  This contains the function prototypes and  other  definitions
271       for all three libraries. One, two, or all three can be installed simul‐
272       taneously. On Unix-like systems the libraries  are  called  libpcre2-8,
273       libpcre2-16, and libpcre2-32, and they can also co-exist with the orig‐
274       inal PCRE libraries.
275
276       Character strings are passed to and from a PCRE2 library as a  sequence
277       of  unsigned  integers  in  code  units of the appropriate width. Every
278       PCRE2 function comes in three different forms, one  for  each  library,
279       for example:
280
281         pcre2_compile_8()
282         pcre2_compile_16()
283         pcre2_compile_32()
284
285       There are also three different sets of data types:
286
287         PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32
288         PCRE2_SPTR8,  PCRE2_SPTR16,  PCRE2_SPTR32
289
290       The  UCHAR  types define unsigned code units of the appropriate widths.
291       For example, PCRE2_UCHAR16 is usually defined as `uint16_t'.  The  SPTR
292       types  are  constant  pointers  to the equivalent UCHAR types, that is,
293       they are pointers to vectors of unsigned code units.
294
295       Many applications use only one code unit width. For their  convenience,
296       macros are defined whose names are the generic forms such as pcre2_com‐
297       pile() and  PCRE2_SPTR.  These  macros  use  the  value  of  the  macro
298       PCRE2_CODE_UNIT_WIDTH  to generate the appropriate width-specific func‐
299       tion and macro names.  PCRE2_CODE_UNIT_WIDTH is not defined by default.
300       An  application  must  define  it  to  be 8, 16, or 32 before including
301       pcre2.h in order to make use of the generic names.
302
303       Applications that use more than one code unit width can be linked  with
304       more  than  one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to
305       be 0 before including pcre2.h, and then use the  real  function  names.
306       Any  code  that  is to be included in an environment where the value of
307       PCRE2_CODE_UNIT_WIDTH is unknown should  also  use  the  real  function
308       names. (Unfortunately, it is not possible in C code to save and restore
309       the value of a macro.)
310
311       If PCRE2_CODE_UNIT_WIDTH is not defined  before  including  pcre2.h,  a
312       compiler error occurs.
313
314       When  using  multiple  libraries  in an application, you must take care
315       when processing any particular pattern to use  only  functions  from  a
316       single  library.   For example, if you want to run a match using a pat‐
317       tern that was compiled with pcre2_compile_16(), you  must  do  so  with
318       pcre2_match_16(), not pcre2_match_8() or pcre2_match_32().
319
320       In  the  function summaries above, and in the rest of this document and
321       other PCRE2 documents, functions and data  types  are  described  using
322       their generic names, without the _8, _16, or _32 suffix.
323

PCRE2 API OVERVIEW

325
326       PCRE2  has  its  own  native  API, which is described in this document.
327       There are also some wrapper functions for the 8-bit library that corre‐
328       spond  to the POSIX regular expression API, but they do not give access
329       to all the functionality of PCRE2. They are described in the pcre2posix
330       documentation. Both these APIs define a set of C function calls.
331
332       The  native  API  C data types, function prototypes, option values, and
333       error codes are defined in the header file pcre2.h, which also contains
334       definitions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release
335       numbers for the library. Applications can use these to include  support
336       for different releases of PCRE2.
337
338       In a Windows environment, if you want to statically link an application
339       program against a non-dll PCRE2 library, you must  define  PCRE2_STATIC
340       before including pcre2.h.
341
342       The  functions pcre2_compile() and pcre2_match() are used for compiling
343       and matching regular expressions in a Perl-compatible manner. A  sample
344       program that demonstrates the simplest way of using them is provided in
345       the file called pcre2demo.c in the PCRE2 source distribution. A listing
346       of  this  program  is  given  in  the  pcre2demo documentation, and the
347       pcre2sample documentation describes how to compile and run it.
348
349       The compiling and matching functions recognize various options that are
350       passed as bits in an options argument. There are also some more compli‐
351       cated  parameters  such  as  custom  memory  management  functions  and
352       resource  limits  that  are passed in "contexts" (which are just memory
353       blocks, described below). Simple applications do not need to  make  use
354       of contexts.
355
356       Just-in-time  (JIT)  compiler  support  is an optional feature of PCRE2
357       that can be built in  appropriate  hardware  environments.  It  greatly
358       speeds  up  the  matching  performance  of  many patterns. Programs can
359       request that it be used if  available  by  calling  pcre2_jit_compile()
360       after a pattern has been successfully compiled by pcre2_compile(). This
361       does nothing if JIT support is not available.
362
363       More complicated programs might need to  make  use  of  the  specialist
364       functions    pcre2_jit_stack_create(),    pcre2_jit_stack_free(),   and
365       pcre2_jit_stack_assign() in order to  control  the  JIT  code's  memory
366       usage.
367
368       JIT matching is automatically used by pcre2_match() if it is available,
369       unless the PCRE2_NO_JIT option is set. There is also a direct interface
370       for  JIT  matching,  which gives improved performance at the expense of
371       less sanity checking. The JIT-specific functions are discussed  in  the
372       pcre2jit documentation.
373
374       A  second  matching function, pcre2_dfa_match(), which is not Perl-com‐
375       patible, is also provided. This uses  a  different  algorithm  for  the
376       matching.  The  alternative  algorithm finds all possible matches (at a
377       given point in the subject), and scans the subject  just  once  (unless
378       there  are  lookaround  assertions).  However,  this algorithm does not
379       return captured substrings. A description of  the  two  matching  algo‐
380       rithms   and  their  advantages  and  disadvantages  is  given  in  the
381       pcre2matching   documentation.   There   is   no   JIT   support    for
382       pcre2_dfa_match().
383
384       In  addition  to  the  main compiling and matching functions, there are
385       convenience functions for extracting captured substrings from a subject
386       string that has been matched by pcre2_match(). They are:
387
388         pcre2_substring_copy_byname()
389         pcre2_substring_copy_bynumber()
390         pcre2_substring_get_byname()
391         pcre2_substring_get_bynumber()
392         pcre2_substring_list_get()
393         pcre2_substring_length_byname()
394         pcre2_substring_length_bynumber()
395         pcre2_substring_nametable_scan()
396         pcre2_substring_number_from_name()
397
398       pcre2_substring_free()  and  pcre2_substring_list_free()  are also pro‐
399       vided, to free memory used for extracted strings. If  either  of  these
400       functions  is called with a NULL argument, the function returns immedi‐
401       ately without doing anything.
402
403       The function pcre2_substitute() can be called to match  a  pattern  and
404       return  a  copy of the subject string with substitutions for parts that
405       were matched.
406
407       Functions whose names begin with pcre2_serialize_ are used  for  saving
408       compiled patterns on disc or elsewhere, and reloading them later.
409
410       Finally,  there  are functions for finding out information about a com‐
411       piled pattern (pcre2_pattern_info()) and about the  configuration  with
412       which PCRE2 was built (pcre2_config()).
413
414       Functions  with  names  ending with _free() are used for freeing memory
415       blocks of various sorts. In all cases, if one  of  these  functions  is
416       called with a NULL argument, it does nothing.
417

STRING LENGTHS AND OFFSETS

419
420       The  PCRE2  API  uses  string  lengths and offsets into strings of code
421       units in several places. These values are always  of  type  PCRE2_SIZE,
422       which  is an unsigned integer type, currently always defined as size_t.
423       The largest  value  that  can  be  stored  in  such  a  type  (that is,
424       ~(PCRE2_SIZE)0)  is reserved as a special indicator for zero-terminated
425       strings and unset offsets.  Therefore, the longest string that  can  be
426       handled is one less than this maximum.
427

NEWLINES

429
430       PCRE2 supports five different conventions for indicating line breaks in
431       strings: a single CR (carriage return) character, a  single  LF  (line‐
432       feed) character, the two-character sequence CRLF, any of the three pre‐
433       ceding, or any Unicode newline sequence. The Unicode newline  sequences
434       are  the  three just mentioned, plus the single characters VT (vertical
435       tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line
436       separator, U+2028), and PS (paragraph separator, U+2029).
437
438       Each  of  the first three conventions is used by at least one operating
439       system as its standard newline sequence. When PCRE2 is built, a default
440       can be specified.  If it is not, the default is set to LF, which is the
441       Unix standard. However, the newline convention can  be  changed  by  an
442       application  when  calling  pcre2_compile(),  or it can be specified by
443       special text at the start of the pattern  itself;  this  overrides  any
444       other  settings.  See  the pcre2pattern page for details of the special
445       character sequences.
446
447       In the PCRE2 documentation the word "newline"  is  used  to  mean  "the
448       character or pair of characters that indicates a line break". The choice
449       of newline convention affects the handling of the dot, circumflex,  and
450       dollar metacharacters, the handling of #-comments in /x mode, and, when
451       CRLF is a recognized line ending sequence, the match position  advance‐
452       ment for a non-anchored pattern. There is more detail about this in the
453       section on pcre2_match() options below.
454
455       The choice of newline convention does not affect the interpretation  of
456       the \n or \r escape sequences, nor does it affect what \R matches; this
457       has its own separate convention.
458

MULTITHREADING

460
461       In a multithreaded application it is important to keep  thread-specific
462       data  separate  from data that can be shared between threads. The PCRE2
463       library code itself is thread-safe: it contains  no  static  or  global
464       variables.  The  API  is  designed to be fairly simple for non-threaded
465       applications while at the same time ensuring that multithreaded  appli‐
466       cations can use it.
467
468       There are several different blocks of data that are used to pass infor‐
469       mation between the application and the PCRE2 libraries.
470
471   The compiled pattern
472
473       A pointer to the compiled form of a pattern is  returned  to  the  user
474       when pcre2_compile() is successful. The data in the compiled pattern is
475       fixed, and does not change when the pattern is matched.  Therefore,  it
476       is  thread-safe, that is, the same compiled pattern can be used by more
477       than one thread simultaneously. For example, an application can compile
478       all its patterns at the start, before forking off multiple threads that
479       use them. However, if the just-in-time (JIT)  optimization  feature  is
480       being  used,  it needs separate memory stack areas for each thread. See
481       the pcre2jit documentation for more details.
482
483       In a more complicated situation, where patterns are compiled only  when
484       they  are  first needed, but are still shared between threads, pointers
485       to compiled patterns must be protected  from  simultaneous  writing  by
486       multiple  threads. This is somewhat tricky to do correctly. If you know
487       that writing to a pointer is atomic in your environment,  you  can  use
488       logic like this:
489
490         Get a read-only (shared) lock (mutex) for pointer
491         if (pointer == NULL)
492           {
493           Get a write (unique) lock for pointer
494           if (pointer == NULL) pointer = pcre2_compile(...
495           }
496         Release the lock
497         Use pointer in pcre2_match()
498
499       Of  course,  testing  for compilation errors should also be included in
500       the code.
501
502       The reason for checking the pointer a second time is as  follows:  Sev‐
503       eral  threads  may have acquired the shared lock and tested the pointer
504       for being NULL, but only one of them will be given the write lock, with
505       the  rest kept waiting. The winning thread will compile the pattern and
506       store the result.  After this thread releases the write  lock,  another
507       thread  will  get it, and if it does not retest pointer for being NULL,
508       will recompile the pattern and overwrite the pointer, creating a memory
509       leak and possibly causing other issues.
510
511       In  an  environment  where  writing to a pointer may not be atomic, the
512       above logic is not sufficient. The thread that is doing  the  compiling
513       may  be descheduled after writing only part of the pointer, which could
514       cause other threads to use an invalid value. Instead  of  checking  the
515       pointer itself, a separate "pointer is valid" flag (that can be updated
516       atomically) must be used:
517
518         Get a read-only (shared) lock (mutex) for pointer
519         if (!pointer_is_valid)
520           {
521           Get a write (unique) lock for pointer
522           if (!pointer_is_valid)
523             {
524             pointer = pcre2_compile(...
525             pointer_is_valid = TRUE
526             }
527           }
528         Release the lock
529         Use pointer in pcre2_match()
530
531       If JIT is being used, but the JIT compilation is not being done immedi‐
532       ately  (perhaps  waiting  to  see if the pattern is used often enough),
533       similar logic is required. JIT compilation updates a value  within  the
534       compiled  code  block, so a thread must gain unique write access to the
535       pointer    before    calling    pcre2_jit_compile().     Alternatively,
536       pcre2_code_copy()  or  pcre2_code_copy_with_tables()  can  be  used  to
537       obtain a private copy of the compiled code before calling the JIT  com‐
538       piler.
539
540   Context blocks
541
542       The  next main section below introduces the idea of "contexts" in which
543       PCRE2 functions are called. A context is nothing more than a collection
544       of parameters that control the way PCRE2 operates. Grouping a number of
545       parameters together in a context is a convenient way of passing them to
546       a  PCRE2  function without using lots of arguments. The parameters that
547       are stored in contexts are in some sense  "advanced  features"  of  the
548       API. Many straightforward applications will not need to use contexts.
549
550       In a multithreaded application, if the parameters in a context are val‐
551       ues that are never changed, the same context can be  used  by  all  the
552       threads. However, if any thread needs to change any value in a context,
553       it must make its own thread-specific copy.
554
555   Match blocks
556
557       The matching functions need a block of memory for storing  the  results
558       of a match. This includes details of what was matched, as well as addi‐
559       tional information such as the name of a (*MARK) setting.  Each  thread
560       must provide its own copy of this memory.
561

PCRE2 CONTEXTS

563
564       Some  PCRE2  functions have a lot of parameters, many of which are used
565       only by specialist applications, for example,  those  that  use  custom
566       memory  management  or  non-standard character tables. To keep function
567       argument lists at a reasonable size, and at the same time to  keep  the
568       API  extensible,  "uncommon" parameters are passed to certain functions
569       in a context instead of directly. A context is just a block  of  memory
570       that  holds  the  parameter  values.   Applications that do not need to
571       adjust any of the context parameters  can  pass  NULL  when  a  context
572       pointer is required.
573
574       There  are  three different types of context: a general context that is
575       relevant for several PCRE2 operations, a compile-time  context,  and  a
576       match-time context.
577
578   The general context
579
580       At  present,  this  context  just  contains  pointers to (and data for)
581       external memory management  functions  that  are  called  from  several
582       places in the PCRE2 library. The context is named `general' rather than
583       specifically `memory' because in future other fields may be  added.  If
584       you  do not want to supply your own custom memory management functions,
585       you do not need to bother with a general context. A general context  is
586       created by:
587
588       pcre2_general_context *pcre2_general_context_create(
589         void *(*private_malloc)(PCRE2_SIZE, void *),
590         void (*private_free)(void *, void *), void *memory_data);
591
592       The  two  function pointers specify custom memory management functions,
593       whose prototypes are:
594
595         void *private_malloc(PCRE2_SIZE, void *);
596         void  private_free(void *, void *);
597
598       Whenever code in PCRE2 calls these functions, the final argument is the
599       value of memory_data. Either of the first two arguments of the creation
600       function may be NULL, in which case the system memory management  func‐
601       tions  malloc()  and free() are used. (This is not currently useful, as
602       there are no other fields in a general context,  but  in  future  there
603       might  be.)   The  private_malloc()  function  is used (if supplied) to
604       obtain memory for storing the context, and all three values  are  saved
605       as part of the context.
606
607       Whenever  PCRE2  creates a data block of any kind, the block contains a
608       pointer to the free() function that matches the malloc() function  that
609       was  used.  When  the  time  comes  to free the block, this function is
610       called.
611
612       A general context can be copied by calling:
613
614       pcre2_general_context *pcre2_general_context_copy(
615         pcre2_general_context *gcontext);
616
617       The memory used for a general context should be freed by calling:
618
619       void pcre2_general_context_free(pcre2_general_context *gcontext);
620
621       If this function is passed a  NULL  argument,  it  returns  immediately
622       without doing anything.
623
624   The compile context
625
626       A  compile context is required if you want to provide an external func‐
627       tion for stack checking during compilation or  to  change  the  default
628       values of any of the following compile-time parameters:
629
630         What \R matches (Unicode newlines or CR, LF, CRLF only)
631         PCRE2's character tables
632         The newline character sequence
633         The compile time nested parentheses limit
634         The maximum length of the pattern string
635         The extra options bits (none set by default)
636
637       A  compile context is also required if you are using custom memory man‐
638       agement.  If none of these apply, just pass NULL as the  context  argu‐
639       ment of pcre2_compile().
640
641       A  compile context is created, copied, and freed by the following func‐
642       tions:
643
644       pcre2_compile_context *pcre2_compile_context_create(
645         pcre2_general_context *gcontext);
646
647       pcre2_compile_context *pcre2_compile_context_copy(
648         pcre2_compile_context *ccontext);
649
650       void pcre2_compile_context_free(pcre2_compile_context *ccontext);
651
652       A compile context is created with default values  for  its  parameters.
653       These can be changed by calling the following functions, which return 0
654       on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
655
656       int pcre2_set_bsr(pcre2_compile_context *ccontext,
657         uint32_t value);
658
659       The value must be PCRE2_BSR_ANYCRLF, to specify that  \R  matches  only
660       CR,  LF,  or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any
661       Unicode line ending sequence. The value is used by the JIT compiler and
662       by   the   two   interpreted   matching  functions,  pcre2_match()  and
663       pcre2_dfa_match().
664
665       int pcre2_set_character_tables(pcre2_compile_context *ccontext,
666         const uint8_t *tables);
667
668       The value must be the result of a  call  to  pcre2_maketables(),  whose
669       only argument is a general context. This function builds a set of char‐
670       acter tables in the current locale.
671
672       int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext,
673         uint32_t extra_options);
674
675       As PCRE2 has developed, almost all the 32 option bits that  are  avail‐
676       able  in  the options argument of pcre2_compile() have been used up. To
677       avoid running out, the compile context contains a set of  extra  option
678       bits  which are used for some newer, assumed rarer, options. This func‐
679       tion sets those bits. It always sets all the bits (either on  or  off).
680       It  replaces  any  existing  setting outright. The available options are
681       defined in the section entitled "Extra compile options" below.
682
683       int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
684         PCRE2_SIZE value);
685
686       This sets a maximum length, in code units, for any pattern string  that
687       is  compiled  with  this context. If the pattern is longer, an error is
688       generated.  This facility is provided so that applications that  accept
689       patterns from external sources can limit their size. The default is the
690       largest number that a PCRE2_SIZE variable can  hold,  which  is  effec‐
691       tively unlimited.
692
693       int pcre2_set_newline(pcre2_compile_context *ccontext,
694         uint32_t value);
695
696       This specifies which characters or character sequences are to be recog‐
697       nized as newlines. The value must be one of PCRE2_NEWLINE_CR  (carriage
698       return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the
699       two-character sequence CR followed by LF),  PCRE2_NEWLINE_ANYCRLF  (any
700       of  the  above),  PCRE2_NEWLINE_ANY  (any Unicode newline sequence), or
701       PCRE2_NEWLINE_NUL (the NUL character, that is a binary zero).
702
703       A pattern can override the value set in the compile context by starting
704       with a sequence such as (*CRLF). See the pcre2pattern page for details.
705
706       When    a    pattern   is   compiled   with   the   PCRE2_EXTENDED   or
707       PCRE2_EXTENDED_MORE option, the newline convention affects the recogni‐
708       tion  of  the  end  of  internal comments starting with #. The value is
709       saved with the compiled pattern for subsequent use by the JIT  compiler
710       and  by  the  two  interpreted  matching  functions,  pcre2_match() and
711       pcre2_dfa_match().
712
713       int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
714         uint32_t value);
715
716       This parameter adjusts the limit, set  when  PCRE2  is  built  (default
717       250),  on  the  depth  of  parenthesis nesting in a pattern. This limit
718       stops rogue patterns using up too much system  stack  when  being  com‐
719       piled.  The limit applies to parentheses of all kinds, not just captur‐
720       ing parentheses.
721
722       int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
723         int (*guard_function)(uint32_t, void *), void *user_data);
724
725       There is at least one application that runs PCRE2 in threads with  very
726       limited  system  stack,  where running out of stack is to be avoided at
727       all costs. The parenthesis limit above cannot take account of how  much
728       stack  is  actually  available during compilation. For a finer control,
729       you can supply a  function  that  is  called  whenever  pcre2_compile()
730       starts  to compile a parenthesized part of a pattern. This function can
731       check the actual stack size (or anything else  that  it  wants  to,  of
732       course).
733
734       The  first  argument  to  the guard function gives the current depth of
735       nesting, and the second is user data that is set up by the  last  argu‐
736       ment   of   pcre2_set_compile_recursion_guard().   The  guard  function
737       should return zero if all is well, or non-zero to force an error.
738
739   The match context
740
741       A match context is required if you want to:
742
743         Set up a callout function
744         Set an offset limit for matching an unanchored pattern
745         Change the limit on the amount of heap used when matching
746         Change the backtracking match limit
747         Change the backtracking depth limit
748         Set custom memory management specifically for the match
749
750       If none of these apply, just pass  NULL  as  the  context  argument  of
751       pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match().
752
753       A  match  context  is created, copied, and freed by the following func‐
754       tions:
755
756       pcre2_match_context *pcre2_match_context_create(
757         pcre2_general_context *gcontext);
758
759       pcre2_match_context *pcre2_match_context_copy(
760         pcre2_match_context *mcontext);
761
762       void pcre2_match_context_free(pcre2_match_context *mcontext);
763
764       A match context is created with  default  values  for  its  parameters.
765       These can be changed by calling the following functions, which return 0
766       on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
767
768       int pcre2_set_callout(pcre2_match_context *mcontext,
769         int (*callout_function)(pcre2_callout_block *, void *),
770         void *callout_data);
771
772       This sets up a callout function for PCRE2 to call at  specified  points
773       during a matching operation. Details are given in the pcre2callout doc‐
774       umentation.
775
776       int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
777         int (*callout_function)(pcre2_substitute_callout_block *, void *),
778         void *callout_data);
779
780       This sets up a callout function for PCRE2 to call after each  substitu‐
781       tion made by pcre2_substitute(). Details are given in the section enti‐
782       tled "Creating a new string with substitutions" below.
783
784       int pcre2_set_offset_limit(pcre2_match_context *mcontext,
785         PCRE2_SIZE value);
786
787       The offset_limit parameter limits how  far  an  unanchored  search  can
788       advance  in  the  subject string. The default value is PCRE2_UNSET. The
789       pcre2_match()     and      pcre2_dfa_match()      functions      return
790       PCRE2_ERROR_NOMATCH  if  a match with a starting point before or at the
791       given offset is not found. The  pcre2_substitute()  function  makes  no
792       more substitutions.
793
794       For  example,  if the pattern /abc/ is matched against "123abc" with an
795       offset limit less than 3, the result is  PCRE2_ERROR_NOMATCH.  A  match
796       can  never  be  found  if  the  startoffset  argument of pcre2_match(),
797       pcre2_dfa_match(), or pcre2_substitute() is  greater  than  the  offset
798       limit set in the match context.
799
800       When  using  this  facility,  you  must  set the PCRE2_USE_OFFSET_LIMIT
801       option when calling pcre2_compile() so that when JIT is in use, differ‐
802       ent  code  can  be  compiled.  If a match is started with a non-default
803       offset limit when PCRE2_USE_OFFSET_LIMIT is not set, an error is gener‐
804       ated.
805
806       The  offset limit facility can be used to track progress when searching
807       large subject strings or to limit the extent of  global  substitutions.
808       See  also  the  PCRE2_FIRSTLINE option, which requires a match to start
809       before or at the first newline that follows the start  of  matching  in
810       the subject. If this is set with an offset limit, a match must occur in
811       the first line and also within the offset limit. In other words, which‐
812       ever limit comes first is used.
813
814       int pcre2_set_heap_limit(pcre2_match_context *mcontext,
815         uint32_t value);
816
817       The heap_limit parameter specifies, in units of kibibytes (1024 bytes),
818       the maximum amount of heap memory that pcre2_match() may  use  to  hold
819       backtracking information when running an interpretive match. This limit
820       also applies to pcre2_dfa_match(), which may use the heap when process‐
821       ing  patterns  with a lot of nested pattern recursion or lookarounds or
822       atomic groups. This limit does not apply to matching with the JIT opti‐
823       mization,  which  has  its  own  memory  control  arrangements (see the
824       pcre2jit documentation for more details). If the limit is reached,  the
825       negative  error  code  PCRE2_ERROR_HEAPLIMIT  is  returned. The default
826       limit can be set when PCRE2 is built; if it is not, the default is  set
827       very large and is essentially "unlimited".
828
829       A value for the heap limit may also be supplied by an item at the start
830       of a pattern of the form
831
832         (*LIMIT_HEAP=ddd)
833
834       where ddd is a decimal number.  However,  such  a  setting  is  ignored
835       unless  ddd  is  less than the limit set by the caller of pcre2_match()
836       or, if no such limit is set, less than the default.
837
838       The pcre2_match() function starts out using a 20KiB vector on the  sys‐
839       tem stack for recording backtracking points. The more nested backtrack‐
840       ing points there are (that is, the deeper the search  tree),  the  more
841       memory  is  needed.   Heap memory is used only if the initial vector is
842       too small. If the heap limit is set to a value less than 21 (in partic‐
843       ular,  zero)  no  heap memory will be used. In this case, only patterns
844       that do not have a lot of nested backtracking can be successfully  pro‐
845       cessed.
846
847       Similarly,  for pcre2_dfa_match(), a vector on the system stack is used
848       when processing pattern recursions, lookarounds, or atomic groups,  and
849       only  if this is not big enough is heap memory used. In this case, too,
850       setting a value of zero disables the use of the heap.
851
852       int pcre2_set_match_limit(pcre2_match_context *mcontext,
853         uint32_t value);
854
855       The match_limit parameter provides a means  of  preventing  PCRE2  from
856       using up too many computing resources when processing patterns that are
857       not going to match, but which have a very large number of possibilities
858       in  their  search  trees.  The  classic  example is a pattern that uses
859       nested unlimited repeats.
860
861       There is an internal counter in pcre2_match() that is incremented  each
862       time  round  its  main  matching  loop. If this value reaches the match
863       limit, pcre2_match() returns the negative value PCRE2_ERROR_MATCHLIMIT.
864       This  has  the  effect  of limiting the amount of backtracking that can
865       take place. For patterns that are not anchored, the count restarts from
866       zero  for  each position in the subject string. This limit also applies
867       to pcre2_dfa_match(), though the counting is done in a different way.
868
869       When pcre2_match() is called with a pattern that was successfully  pro‐
870       cessed by pcre2_jit_compile(), the way in which matching is executed is
871       entirely different. However, there is still the possibility of  runaway
872       matching  that  goes  on  for  a very long time, and so the match_limit
873       value is also used in this case (but in a different way) to  limit  how
874       long the matching can continue.
875
876       The default value for the limit can be set when PCRE2 is built; if it is
877       not, the default is 10 million, which handles all but the most  extreme
878       cases.  A  value for the match limit may also be supplied by an item at
879       the start of a pattern of the form
880
881         (*LIMIT_MATCH=ddd)
882
883       where ddd is a decimal number.  However,  such  a  setting  is  ignored
884       unless ddd is less than the limit set by the caller of pcre2_match() or
885       pcre2_dfa_match() or, if no such limit is set, less than the default.
886
887       int pcre2_set_depth_limit(pcre2_match_context *mcontext,
888         uint32_t value);
889
890       This  parameter  limits   the   depth   of   nested   backtracking   in
891       pcre2_match().   Each time a nested backtracking point is passed, a new
892       memory "frame" is used to remember the state of matching at that point.
893       Thus,  this  parameter  indirectly  limits the amount of memory that is
894       used in a match. However, because  the  size  of  each  memory  "frame"
895       depends on the number of capturing parentheses, the actual memory limit
896       varies from pattern to pattern. This limit was more useful in  versions
897       before 10.30, where function recursion was used for backtracking.
898
899       The  depth limit is not relevant, and is ignored, when matching is done
900       using JIT compiled code. However, it is supported by pcre2_dfa_match(),
901       which  uses it to limit the depth of nested internal recursive function
902       calls that implement atomic groups, lookaround assertions, and  pattern
903       recursions. This limits, indirectly, the amount of system stack that is
904       used. It was more useful in versions before 10.32,  when  stack  memory
905       was used for local workspace vectors for recursive function calls. From
906       version 10.32, only local variables are allocated on the stack  and  as
907       each call uses only a few hundred bytes, even a small stack can support
908       quite a lot of recursion.
909
910       If the depth of internal recursive  function  calls  is  great  enough,
911       local  workspace  vectors  are allocated on the heap from version 10.32
912       onwards, so the depth limit also indirectly limits the amount  of  heap
913       memory that is used. A recursive pattern such as /(.(?2))((?1)|)/, when
914       matched to a very long string using pcre2_dfa_match(), can use a  great
915       deal  of  memory.  However,  it  is probably better to limit heap usage
916       directly by calling pcre2_set_heap_limit().
917
918       The default value for the depth limit can be set when PCRE2  is  built;
919       if  it  is not, the default is set to the same value as the default for
920       the  match  limit.   If  the  limit  is  exceeded,   pcre2_match()   or
921       pcre2_dfa_match() returns PCRE2_ERROR_DEPTHLIMIT. A value for the depth
922       limit may also be supplied by an item at the start of a pattern of  the
923       form
924
925         (*LIMIT_DEPTH=ddd)
926
927       where  ddd  is  a  decimal  number.  However, such a setting is ignored
928       unless ddd is less than the limit set by the caller of pcre2_match() or
929       pcre2_dfa_match() or, if no such limit is set, less than the default.
930

CHECKING BUILD-TIME OPTIONS

932
933       int pcre2_config(uint32_t what, void *where);
934
935       The  function  pcre2_config()  makes  it possible for a PCRE2 client to
936       find the value of certain  configuration  parameters  and  to  discover
937       which  optional features have been compiled into the PCRE2 library. The
938       pcre2build documentation has more details about these features.
939
940       The first argument for pcre2_config() specifies  which  information  is
941       required.  The  second  argument  is a pointer to memory into which the
942       information is placed. If NULL is  passed,  the  function  returns  the
943       amount  of  memory  that  is  needed for the requested information. For
944       calls that return  numerical  values,  the  value  is  in  bytes;  when
945       requesting  these  values,  where should point to appropriately aligned
946       memory. For calls that return strings, the required length is given  in
947       code units, not counting the terminating zero.
948
949       When  requesting information, the returned value from pcre2_config() is
950       non-negative on success, or the negative error code  PCRE2_ERROR_BADOP‐
951       TION  if the value in the first argument is not recognized. The follow‐
952       ing information is available:
953
954         PCRE2_CONFIG_BSR
955
956       The output is a uint32_t integer whose value indicates  what  character
957       sequences  the  \R  escape  sequence  matches  by  default.  A value of
958       PCRE2_BSR_UNICODE  means  that  \R  matches  any  Unicode  line  ending
959       sequence;  a  value of PCRE2_BSR_ANYCRLF means that \R matches only CR,
960       LF, or CRLF. The default can be overridden when a pattern is compiled.
961
962         PCRE2_CONFIG_COMPILED_WIDTHS
963
964       The output is a uint32_t integer whose lower bits indicate  which  code
965       unit  widths  were  selected  when PCRE2 was built. The 1-bit indicates
966       8-bit support, and the 2-bit and 4-bit indicate 16-bit and 32-bit  sup‐
967       port, respectively.
968
969         PCRE2_CONFIG_DEPTHLIMIT
970
971       The  output  is a uint32_t integer that gives the default limit for the
972       depth of nested backtracking in pcre2_match() or the  depth  of  nested
973       recursions,  lookarounds,  and atomic groups in pcre2_dfa_match(). Fur‐
974       ther details are given with pcre2_set_depth_limit() above.
975
976         PCRE2_CONFIG_HEAPLIMIT
977
978       The output is a uint32_t integer that gives, in kibibytes, the  default
979       limit   for  the  amount  of  heap  memory  used  by  pcre2_match()  or
980       pcre2_dfa_match().     Further     details     are      given      with
981       pcre2_set_heap_limit() above.
982
983         PCRE2_CONFIG_JIT
984
985       The  output  is  a  uint32_t  integer that is set to one if support for
986       just-in-time compiling is available; otherwise it is set to zero.
987
988         PCRE2_CONFIG_JITTARGET
989
990       The where argument should point to a buffer that is at  least  48  code
991       units  long.  (The  exact  length  required  can  be  found  by calling
992       pcre2_config() with where set to NULL.) The buffer  is  filled  with  a
993       string  that  contains  the  name of the architecture for which the JIT
994       compiler is  configured,  for  example  "x86  32bit  (little  endian  +
995       unaligned)".  If JIT support is not available, PCRE2_ERROR_BADOPTION is
996       returned, otherwise the number of code units used is returned. This  is
997       the length of the string, plus one unit for the terminating zero.
998
999         PCRE2_CONFIG_LINKSIZE
1000
1001       The output is a uint32_t integer that contains the number of bytes used
1002       for internal linkage in compiled regular  expressions.  When  PCRE2  is
1003       configured,  the value can be set to 2, 3, or 4, with the default being
1004       2. This is the value that is returned by pcre2_config(). However,  when
1005       the  16-bit  library  is compiled, a value of 3 is rounded up to 4, and
1006       when the 32-bit library is compiled, internal  linkages  always  use  4
1007       bytes, so the configured value is not relevant.
1008
1009       The default value of 2 for the 8-bit and 16-bit libraries is sufficient
1010       for all but the most massive patterns, since it allows the size of  the
1011       compiled  pattern  to  be  up  to 65535 code units. Larger values allow
1012       larger regular expressions to be compiled by those two  libraries,  but
1013       at the expense of slower matching.
1014
1015         PCRE2_CONFIG_MATCHLIMIT
1016
1017       The output is a uint32_t integer that gives the default match limit for
1018       pcre2_match(). Further details are given  with  pcre2_set_match_limit()
1019       above.
1020
1021         PCRE2_CONFIG_NEWLINE
1022
1023       The  output  is  a  uint32_t  integer whose value specifies the default
1024       character sequence that is recognized as meaning "newline". The  values
1025       are:
1026
1027         PCRE2_NEWLINE_CR       Carriage return (CR)
1028         PCRE2_NEWLINE_LF       Linefeed (LF)
1029         PCRE2_NEWLINE_CRLF     Carriage return, linefeed (CRLF)
1030         PCRE2_NEWLINE_ANY      Any Unicode line ending
1031         PCRE2_NEWLINE_ANYCRLF  Any of CR, LF, or CRLF
1032         PCRE2_NEWLINE_NUL      The NUL character (binary zero)
1033
1034       The  default  should  normally  correspond to the standard sequence for
1035       your operating system.
1036
1037         PCRE2_CONFIG_NEVER_BACKSLASH_C
1038
1039       The output is a uint32_t integer that is set to one if the  use  of  \C
1040       was  permanently  disabled when PCRE2 was built; otherwise it is set to
1041       zero.
1042
1043         PCRE2_CONFIG_PARENSLIMIT
1044
1045       The output is a uint32_t integer that gives the maximum depth of  nest‐
1046       ing of parentheses (of any kind) in a pattern. This limit is imposed to
1047       cap the amount of system stack used when a pattern is compiled.  It  is
1048       specified  when PCRE2 is built; the default is 250. This limit does not
1049       take into account the stack that may already be  used  by  the  calling
1050       application.  For  finer  control  over  compilation  stack  usage, see
1051       pcre2_set_compile_recursion_guard().
1052
1053         PCRE2_CONFIG_STACKRECURSE
1054
1055       This parameter is obsolete and should not be used in new code. The out‐
1056       put is a uint32_t integer that is always set to zero.
1057
1058         PCRE2_CONFIG_TABLES_LENGTH
1059
1060       The output is a uint32_t integer that gives the length of PCRE2's char‐
1061       acter processing tables in bytes. For details of these tables  see  the
1062       section on locale support below.
1063
1064         PCRE2_CONFIG_UNICODE_VERSION
1065
1066       The  where  argument  should point to a buffer that is at least 24 code
1067       units long.  (The  exact  length  required  can  be  found  by  calling
1068       pcre2_config()  with  where  set  to  NULL.) If PCRE2 has been compiled
1069       without Unicode support, the buffer is filled with  the  text  "Unicode
1070       not  supported".  Otherwise,  the  Unicode version string (for example,
1071       "8.0.0") is inserted. The number of code units used is  returned.  This
1072       is the length of the string plus one unit for the terminating zero.
1073
1074         PCRE2_CONFIG_UNICODE
1075
1076       The  output is a uint32_t integer that is set to one if Unicode support
1077       is available; otherwise it is set to zero. Unicode support implies  UTF
1078       support.
1079
1080         PCRE2_CONFIG_VERSION
1081
1082       The  where  argument  should point to a buffer that is at least 24 code
1083       units long.  (The  exact  length  required  can  be  found  by  calling
1084       pcre2_config()  with  where set to NULL.) The buffer is filled with the
1085       PCRE2 version string, zero-terminated. The number of code units used is
1086       returned. This is the length of the string plus one unit for the termi‐
1087       nating zero.
1088

COMPILING A PATTERN

1090
1091       pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
1092         uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
1093         pcre2_compile_context *ccontext);
1094
1095       void pcre2_code_free(pcre2_code *code);
1096
1097       pcre2_code *pcre2_code_copy(const pcre2_code *code);
1098
1099       pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
1100
1101       The pcre2_compile() function compiles a pattern into an internal  form.
1102       The  pattern  is  defined  by a pointer to a string of code units and a
1103       length (in code units). If the pattern is zero-terminated,  the  length
1104       can  be  specified  as  PCRE2_ZERO_TERMINATED.  The  function returns a
1105       pointer to a block of memory that contains  the  compiled  pattern  and
1106       related data, or NULL if an error occurred.
1107
1108       If  the  compile context argument ccontext is NULL, memory for the com‐
1109       piled pattern  is  obtained  by  calling  malloc().  Otherwise,  it  is
1110       obtained  from  the  same memory function that was used for the compile
1111       context. The caller must free the memory by  calling  pcre2_code_free()
1112       when  it  is  no  longer needed.  If pcre2_code_free() is called with a
1113       NULL argument, it returns immediately, without doing anything.
1114
1115       The function pcre2_code_copy() makes a copy of the compiled code in new
1116       memory,  using  the same memory allocator as was used for the original.
1117       However, if the code has  been  processed  by  the  JIT  compiler  (see
1118       below),  the  JIT information cannot be copied (because it is position-
1119       dependent).  The new copy can initially be used only for non-JIT match‐
1120       ing,  though  it  can  be passed to pcre2_jit_compile() if required. If
1121       pcre2_code_copy() is called with a NULL argument, it returns NULL.
1122
1123       The pcre2_code_copy() function provides a way for individual threads in
1124       a  multithreaded  application  to acquire a private copy of shared com‐
1125       piled code.  However, it does not make a copy of the  character  tables
1126       used  by  the compiled pattern; the new pattern code points to the same
1127       tables as the original code.  (See "Locale Support" below  for  details
1128       of  these  character  tables.) In many applications the same tables are
1129       used throughout, so this behaviour is appropriate. Nevertheless,  there
1130       are occasions when a copy of a compiled pattern and the relevant tables
1131       are  needed;  pcre2_code_copy_with_tables()  provides  this   facility.
1132       Copies  of  both  the  code  and the tables are made, with the new code
1133       pointing to the new tables. The memory for the new tables is  automati‐
1134       cally  freed  when  pcre2_code_free() is called for the new copy of the
1135       compiled code. If pcre2_code_copy_with_tables() is called with  a  NULL
1136       argument, it returns NULL.
1137
1138       NOTE:  When  one  of  the matching functions is called, pointers to the
1139       compiled pattern and the subject string are set in the match data block
1140       so  that  they  can be referenced by the substring extraction functions
1141       after a successful match.  After running a match, you must not  free  a
1142       compiled  pattern or a subject string until after all operations on the
1143       match data block have taken place, unless, in the case of  the  subject
1144       string,  you  have used the PCRE2_COPY_MATCHED_SUBJECT option, which is
1145       described in the  section  entitled  "Option  bits  for  pcre2_match()"
1146       below.
1147
1148       The  options argument for pcre2_compile() contains various bit settings
1149       that affect the compilation. It should be zero  if  none  of  them  are
1150       required.  The  available options are described below. Some of them (in
1151       particular, those that are compatible with Perl,  but  some  others  as
1152       well)  can  also  be  set  and  unset  from within the pattern (see the
1153       detailed description in the pcre2pattern documentation).
1154
1155       For those options that can be different in different parts of the  pat‐
1156       tern,  the contents of the options argument specifies their settings at
1157       the start of compilation. The  PCRE2_ANCHORED,  PCRE2_ENDANCHORED,  and
1158       PCRE2_NO_UTF_CHECK  options  can be set at the time of matching as well
1159       as at compile time.
1160
1161       Some additional  options  and  less  frequently  required  compile-time
1162       parameters (for example, the newline setting) can be provided in a com‐
1163       pile context (as described above).
1164
1165       If errorcode or erroroffset is NULL, pcre2_compile() returns NULL imme‐
1166       diately.  Otherwise,  the  variables to which these point are set to an
1167       error code and an offset (number of code  units)  within  the  pattern,
1168       respectively,  when  pcre2_compile() returns NULL because a compilation
1169       error has occurred. The values are not defined when compilation is suc‐
1170       cessful and pcre2_compile() returns a non-NULL value.
1171
1172       There  are  nearly  100  positive  error codes that pcre2_compile() may
1173       return if it finds an error in the pattern. There are also  some  nega‐
1174       tive  error  codes  that are used for invalid UTF strings when validity
1175       checking is in force. These are the same as given by pcre2_match()  and
1176       pcre2_dfa_match(), and are described in the pcre2unicode documentation.
1177       There is no  separate  documentation  for  the  positive  error  codes,
1178       because  the  textual  error  messages that are obtained by calling the
1179       pcre2_get_error_message() function (see "Obtaining a textual error mes‐
1180       sage"  below)  should  be  self-explanatory.  Macro names starting with
1181       PCRE2_ERROR_ are defined for both positive and negative error codes  in
1182       pcre2.h.
1183
1184       The value returned in erroroffset is an indication of where in the pat‐
1185       tern the error occurred. It is not necessarily the  furthest  point  in
1186       the  pattern  that  was  read. For example, after the error "lookbehind
1187       assertion is not fixed length", the error offset points to the start of
1188       the  failing assertion. For an invalid UTF-8 or UTF-16 string, the off‐
1189       set is that of the first code unit of the failing character.
1190
1191       Some errors are not detected until the whole pattern has been  scanned;
1192       in  these  cases,  the offset passed back is the length of the pattern.
1193       Note that the offset is in code units, not characters, even  in  a  UTF
1194       mode. It may sometimes point into the middle of a UTF-8 or UTF-16 char‐
1195       acter.
1196
1197       This code fragment shows a typical straightforward call  to  pcre2_com‐
1198       pile():
1199
1200         pcre2_code *re;
1201         PCRE2_SIZE erroffset;
1202         int errorcode;
1203         re = pcre2_compile(
1204           "^A.*Z",                /* the pattern */
1205           PCRE2_ZERO_TERMINATED,  /* the pattern is zero-terminated */
1206           0,                      /* default options */
1207           &errorcode,             /* for error code */
1208           &erroffset,             /* for error offset */
1209           NULL);                  /* no compile context */
1210
1211
1212   Main compile options
1213
1214       The  following  names for option bits are defined in the pcre2.h header
1215       file:
1216
1217         PCRE2_ANCHORED
1218
1219       If this bit is set, the pattern is forced to be "anchored", that is, it
1220       is  constrained to match only at the first matching point in the string
1221       that is being searched (the "subject string"). This effect can also  be
1222       achieved  by appropriate constructs in the pattern itself, which is the
1223       only way to do it in Perl.
1224
1225         PCRE2_ALLOW_EMPTY_CLASS
1226
1227       By default, for compatibility with Perl, a closing square bracket  that
1228       immediately  follows  an opening one is treated as a data character for
1229       the class. When  PCRE2_ALLOW_EMPTY_CLASS  is  set,  it  terminates  the
1230       class, which therefore contains no characters and so can never match.
1231
1232         PCRE2_ALT_BSUX
1233
1234       This  option  requests  alternative handling of three escape sequences,
1235       which makes PCRE2's behaviour more like  ECMAscript  (aka  JavaScript).
1236       When it is set:
1237
1238       (1) \U matches an upper case "U" character; by default \U causes a com‐
1239       pile time error (Perl uses \U to upper case subsequent characters).
1240
1241       (2) \u matches a lower case "u" character unless it is followed by four
1242       hexadecimal  digits,  in  which case the hexadecimal number defines the
1243       code point to match. By default, \u causes a compile time  error  (Perl
1244       uses it to upper case the following character).
1245
1246       (3)  \x matches a lower case "x" character unless it is followed by two
1247       hexadecimal digits, in which case the hexadecimal  number  defines  the
1248       code  point  to  match. By default, as in Perl, a hexadecimal number is
1249       always expected after \x, but it may have zero, one, or two digits (so,
1250       for example, \xz matches a binary zero character followed by z).
1251
1252       ECMAscript 6 added additional functionality to \u. This can be accessed
1253       using  the  PCRE2_EXTRA_ALT_BSUX  extra  option  (see  "Extra   compile
1254       options"  below).   Note  that this alternative escape handling applies
1255       only to patterns. Neither of these options affects  the  processing  of
1256       replacement strings passed to pcre2_substitute().
1257
1258         PCRE2_ALT_CIRCUMFLEX
1259
1260       In  multiline  mode  (when  PCRE2_MULTILINE  is  set),  the  circumflex
1261       metacharacter matches at the start of the subject (unless  PCRE2_NOTBOL
1262       is  set),  and  also  after  any internal newline. However, it does not
1263       match after a newline at the end of the subject, for compatibility with
1264       Perl.  If  you want a multiline circumflex also to match after a termi‐
1265       nating newline, you must set PCRE2_ALT_CIRCUMFLEX.
1266
1267         PCRE2_ALT_VERBNAMES
1268
1269       By default, for compatibility with Perl, the name in any verb  sequence
1270       such  as  (*MARK:NAME)  is  any  sequence  of  characters that does not
1271       include a closing parenthesis. The name is not processed  in  any  way,
1272       and  it  is  not possible to include a closing parenthesis in the name.
1273       However, if the PCRE2_ALT_VERBNAMES option  is  set,  normal  backslash
1274       processing  is  applied  to  verb  names  and only an unescaped closing
1275       parenthesis terminates the name. A closing parenthesis can be  included
1276       in  a  name either as \) or between \Q and \E. If the PCRE2_EXTENDED or
1277       PCRE2_EXTENDED_MORE option is set with  PCRE2_ALT_VERBNAMES,  unescaped
1278       whitespace  in  verb  names  is  skipped and #-comments are recognized,
1279       exactly as in the rest of the pattern.
1280
1281         PCRE2_AUTO_CALLOUT
1282
1283       If this bit  is  set,  pcre2_compile()  automatically  inserts  callout
1284       items,  all  with  number 255, before each pattern item, except immedi‐
1285       ately before or after an explicit callout in the pattern.  For  discus‐
1286       sion of the callout facility, see the pcre2callout documentation.
1287
1288         PCRE2_CASELESS
1289
1290       If  this  bit is set, letters in the pattern match both upper and lower
1291       case letters in the subject. It is equivalent to Perl's /i option,  and
1292       it  can be changed within a pattern by a (?i) option setting. If either
1293       PCRE2_UTF or PCRE2_UCP is set, Unicode  properties  are  used  for  all
1294       characters  with more than one other case, and for all characters whose
1295       code points are greater than U+007F. Note  that  there  are  two  ASCII
1296       characters, K and S, that, in addition to their lower case ASCII equiv‐
1297       alents, are case-equivalent with U+212A (Kelvin sign) and U+017F  (long
1298       S)  respectively. For lower valued characters with only one other case,
1299       a lookup table is used for speed. When neither PCRE2_UTF nor  PCRE2_UCP
1300       is  set,  a lookup table is used for all code points less than 256, and
1301       higher code points (available  only  in  16-bit  or  32-bit  mode)  are
1302       treated as not having another case.
1303
1304         PCRE2_DOLLAR_ENDONLY
1305
1306       If  this bit is set, a dollar metacharacter in the pattern matches only
1307       at the end of the subject string. Without this option,  a  dollar  also
1308       matches  immediately before a newline at the end of the string (but not
1309       before any other newlines). The PCRE2_DOLLAR_ENDONLY option is  ignored
1310       if  PCRE2_MULTILINE  is  set.  There is no equivalent to this option in
1311       Perl, and no way to set it within a pattern.
1312
1313         PCRE2_DOTALL
1314
1315       If this bit is set, a dot metacharacter  in  the  pattern  matches  any
1316       character,  including  one  that  indicates a newline. However, it only
1317       ever matches one character, even if newlines are coded as CRLF. Without
1318       this option, a dot does not match when the current position in the sub‐
1319       ject is at a newline. This option is equivalent to  Perl's  /s  option,
1320       and it can be changed within a pattern by a (?s) option setting. A neg‐
1321       ative class such as [^a] always matches newline characters, and the  \N
1322       escape  sequence always matches a non-newline character, independent of
1323       the setting of PCRE2_DOTALL.
1324
1325         PCRE2_DUPNAMES
1326
1327       If this bit is set, names used to identify capture groups need  not  be
1328       unique.   This  can  be helpful for certain types of pattern when it is
1329       known that only one instance of the named group can  ever  be  matched.
1330       There  are  more  details  of  named capture groups below; see also the
1331       pcre2pattern documentation.
1332
1333         PCRE2_ENDANCHORED
1334
1335       If this bit is set, the end of any pattern match must be right  at  the
1336       end of the string being searched (the "subject string"). If the pattern
1337       match succeeds by reaching (*ACCEPT), but does not reach the end of the
1338       subject,  the match fails at the current starting point. For unanchored
1339       patterns, a new match is then tried at the next  starting  point.  How‐
1340       ever, if the match succeeds by reaching the end of the pattern, but not
1341       the end of the subject, backtracking occurs and  an  alternative  match
1342       may be found. Consider these two patterns:
1343
1344         .(*ACCEPT)|..
1345         .|..
1346
1347       If  matched against "abc" with PCRE2_ENDANCHORED set, the first matches
1348       "c" whereas the second matches "bc". The  effect  of  PCRE2_ENDANCHORED
1349       can  also  be achieved by appropriate constructs in the pattern itself,
1350       which is the only way to do it in Perl.
1351
1352       For DFA matching with pcre2_dfa_match(), PCRE2_ENDANCHORED applies only
1353       to  the  first  (that  is,  the longest) matched string. Other parallel
1354       matches, which are necessarily substrings of the first one, must  obvi‐
1355       ously end before the end of the subject.
1356
1357         PCRE2_EXTENDED
1358
1359       If  this  bit  is  set,  most white space characters in the pattern are
1360       totally ignored except when escaped or inside a character  class.  How‐
1361       ever,  white  space  is  not  allowed within sequences such as (?> that
1362       introduce various parenthesized groups, nor  within  numerical  quanti‐
1363       fiers such as {1,3}. Ignorable white space is permitted between an item
1364       and a following quantifier and between a quantifier and a  following  +
1365       that  indicates  possessiveness. PCRE2_EXTENDED is equivalent to Perl's
1366       /x option, and it can be changed within a pattern by a (?x) option set‐
1367       ting.
1368
1369       When  PCRE2  is compiled without Unicode support, PCRE2_EXTENDED recog‐
1370       nizes as white space only those characters with code points  less  than
1371       256 that are flagged as white space in its low-character table. The ta‐
1372       ble is normally created by pcre2_maketables(), which uses the isspace()
1373       function  to identify space characters. In most ASCII environments, the
1374       relevant characters are those with code  points  0x0009  (tab),  0x000A
1375       (linefeed),  0x000B (vertical tab), 0x000C (formfeed), 0x000D (carriage
1376       return), and 0x0020 (space).
1377
1378       When PCRE2 is compiled with Unicode support, in addition to these char‐
1379       acters,  five  more Unicode "Pattern White Space" characters are recog‐
1380       nized by PCRE2_EXTENDED. These are U+0085 (next line), U+200E (left-to-
1381       right  mark), U+200F (right-to-left mark), U+2028 (line separator), and
1382       U+2029 (paragraph separator). This set of characters  is  the  same  as
1383       recognized  by  Perl's /x option. Note that the horizontal and vertical
1384       space characters that are matched by the \h and \v escapes in  patterns
1385       are a much bigger set.
1386
1387       As  well as ignoring most white space, PCRE2_EXTENDED also causes char‐
1388       acters between an unescaped # outside a character class  and  the  next
1389       newline,  inclusive,  to be ignored, which makes it possible to include
1390       comments inside complicated patterns. Note that the end of this type of
1391       comment  is a literal newline sequence in the pattern; escape sequences
1392       that happen to represent a newline do not count.
1393
1394       Which characters are interpreted as newlines can be specified by a set‐
1395       ting  in  the compile context that is passed to pcre2_compile() or by a
1396       special sequence at the start of the pattern, as described in the  sec‐
1397       tion  entitled "Newline conventions" in the pcre2pattern documentation.
1398       A default is defined when PCRE2 is built.
1399
1400         PCRE2_EXTENDED_MORE
1401
1402       This option  has  the  effect  of  PCRE2_EXTENDED,  but,  in  addition,
1403       unescaped  space  and  horizontal  tab  characters are ignored inside a
1404       character class. Note: only these two characters are ignored,  not  the
1405       full  set  of pattern white space characters that are ignored outside a
1406       character  class.  PCRE2_EXTENDED_MORE  is  equivalent  to  Perl's  /xx
1407       option,  and  it can be changed within a pattern by a (?xx) option set‐
1408       ting.
1409
1410         PCRE2_FIRSTLINE
1411
1412       If this option is set, the start of an unanchored pattern match must be
1413       before  or  at  the  first  newline in the subject string following the
1414       start of matching, though the matched text may continue over  the  new‐
1415       line. If startoffset is non-zero, the limiting newline is not necessar‐
1416       ily the first newline in the  subject.  For  example,  if  the  subject
1417       string is "abc\nxyz" (where \n represents a single-character newline) a
1418       pattern match for "yz" succeeds with PCRE2_FIRSTLINE if startoffset  is
1419       greater  than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more
1420       general limiting facility. If PCRE2_FIRSTLINE is  set  with  an  offset
1421       limit,  a match must occur in the first line and also within the offset
1422       limit. In other words, whichever limit comes first is used.
1423
1424         PCRE2_LITERAL
1425
1426       If this option is set, all meta-characters in the pattern are disabled,
1427       and  it is treated as a literal string. Matching literal strings with a
1428       regular expression engine is not the most efficient way of doing it. If
1429       you  are  doing  a  lot of literal matching and are worried about effi‐
1430       ciency, you should consider using other approaches. The only other main
1431       options  that  are  allowed  with  PCRE2_LITERAL  are:  PCRE2_ANCHORED,
1432       PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, PCRE2_CASELESS, PCRE2_FIRSTLINE,
1433       PCRE2_MATCH_INVALID_UTF,  PCRE2_NO_START_OPTIMIZE,  PCRE2_NO_UTF_CHECK,
1434       PCRE2_UTF,    and    PCRE2_USE_OFFSET_LIMIT.    The    extra    options
1435       PCRE2_EXTRA_MATCH_LINE  and  PCRE2_EXTRA_MATCH_WORD are also supported.
1436       Any other options cause an error.
1437
1438         PCRE2_MATCH_INVALID_UTF
1439
1440       This option forces PCRE2_UTF (see below) and also enables  support  for
1441       matching  by  pcre2_match() in subject strings that contain invalid UTF
1442       sequences.  This facility  is  not  supported  for  DFA  matching.  For
1443       details, see the pcre2unicode documentation.
1444
1445         PCRE2_MATCH_UNSET_BACKREF
1446
1447       If  this  option  is  set,  a  backreference  to an unset capture group
1448       matches an empty string (by default this causes  the  current  matching
1449       alternative  to  fail).   A  pattern such as (\1)(a) succeeds when this
1450       option is set (assuming it can find an "a" in the subject), whereas  it
1451       fails  by  default,  for  Perl compatibility. Setting this option makes
1452       PCRE2 behave more like ECMAscript (aka JavaScript).
1453
1454         PCRE2_MULTILINE
1455
1456       By default, for the purposes of matching "start of line"  and  "end  of
1457       line",  PCRE2  treats the subject string as consisting of a single line
1458       of characters, even if it actually contains  newlines.  The  "start  of
1459       line"  metacharacter  (^)  matches only at the start of the string, and
1460       the "end of line" metacharacter ($) matches only  at  the  end  of  the
1461       string,  or  before  a  terminating  newline  (except  when  PCRE2_DOL‐
1462       LAR_ENDONLY is set). Note, however, that unless  PCRE2_DOTALL  is  set,
1463       the "any character" metacharacter (.) does not match at a newline. This
1464       behaviour (for ^, $, and dot) is the same as Perl.
1465
1466       When PCRE2_MULTILINE is set, the "start of line" and  "end  of  line"
1467       constructs  match  immediately following or immediately before internal
1468       newlines in the subject string, respectively, as well as  at  the  very
1469       start  and  end.  This is equivalent to Perl's /m option, and it can be
1470       changed within a pattern by a (?m) option setting. Note that the "start
1471       of line" metacharacter does not match after a newline at the end of the
1472       subject, for compatibility with Perl.  However, you can change this  by
1473       setting  the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a
1474       subject string, or no occurrences of ^  or  $  in  a  pattern,  setting
1475       PCRE2_MULTILINE has no effect.
1476
1477         PCRE2_NEVER_BACKSLASH_C
1478
1479       This  option  locks out the use of \C in the pattern that is being com‐
1480       piled.  This escape can  cause  unpredictable  behaviour  in  UTF-8  or
1481       UTF-16  modes,  because  it may leave the current matching point in the
1482       middle of a multi-code-unit character. This option  may  be  useful  in
1483       applications  that  process  patterns  from external sources. Note that
1484       there is also a build-time option that permanently locks out the use of
1485       \C.
1486
1487         PCRE2_NEVER_UCP
1488
1489       This  option  locks  out the use of Unicode properties for handling \B,
1490       \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as
1491       described  for  the  PCRE2_UCP option below. In particular, it prevents
1492       the creator of the pattern from enabling this facility by starting  the
1493       pattern  with  (*UCP).  This  option may be useful in applications that
1494       process patterns from external sources. The option combination PCRE2_UCP
1495       and PCRE2_NEVER_UCP causes an error.
1496
1497         PCRE2_NEVER_UTF
1498
1499       This  option  locks out interpretation of the pattern as UTF-8, UTF-16,
1500       or UTF-32, depending on which library is in use. In particular, it pre‐
1501       vents  the  creator of the pattern from switching to UTF interpretation
1502       by starting the pattern with (*UTF).  This  option  may  be  useful  in
1503       applications  that process patterns from external sources. The combina‐
1504       tion of PCRE2_UTF and PCRE2_NEVER_UTF causes an error.
1505
1506         PCRE2_NO_AUTO_CAPTURE
1507
1508       If this option is set, it disables the use of numbered capturing paren‐
1509       theses  in the pattern. Any opening parenthesis that is not followed by
1510       ? behaves as if it were followed by ?: but named parentheses can  still
1511       be used for capturing (and they acquire numbers in the usual way). This
1512       is the same as Perl's /n option.  Note that, when this option  is  set,
1513       references  to  capture  groups (backreferences or recursion/subroutine
1514       calls) may only refer to named groups, though the reference can  be  by
1515       name or by number.
1516
1517         PCRE2_NO_AUTO_POSSESS
1518
1519       If this option is set, it disables "auto-possessification", which is an
1520       optimization that, for example, turns a+b into a++b in order  to  avoid
1521       backtracks  into  a+ that can never be successful. However, if callouts
1522       are in use, auto-possessification means that some  callouts  are  never
1523       taken. You can set this option if you want the matching functions to do
1524       a full unoptimized search and run all the callouts, but  it  is  mainly
1525       provided for testing purposes.
1526
1527         PCRE2_NO_DOTSTAR_ANCHOR
1528
1529       If this option is set, it disables an optimization that is applied when
1530       .* is the first significant item in a top-level branch  of  a  pattern,
1531       and  all  the  other branches also start with .* or with \A or \G or ^.
1532       The optimization is automatically disabled for .* if it  is  inside  an
1533       atomic group or a capture group that is the subject of a backreference,
1534       or if the pattern contains (*PRUNE) or (*SKIP). When  the  optimization
1535       is   not   disabled,  such  a  pattern  is  automatically  anchored  if
1536       PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set
1537       for  any  ^ items. Otherwise, the fact that any match must start either
1538       at the start of the subject or following a newline is remembered.  Like
1539       other optimizations, this can cause callouts to be skipped.
1540
1541         PCRE2_NO_START_OPTIMIZE
1542
1543       This  is  an  option whose main effect is at matching time. It does not
1544       change what pcre2_compile() generates, but it does affect the output of
1545       the JIT compiler.
1546
1547       There  are  a  number of optimizations that may occur at the start of a
1548       match, in order to speed up the process. For example, if  it  is  known
1549       that  an  unanchored  match must start with a specific code unit value,
1550       the matching code searches the subject for that value, and fails  imme‐
1551       diately  if it cannot find it, without actually running the main match‐
1552       ing function. This means that a special item such as (*COMMIT)  at  the
1553       start  of  a  pattern is not considered until after a suitable starting
1554       point for the match has been found.  Also,  when  callouts  or  (*MARK)
1555       items  are  in use, these "start-up" optimizations can cause them to be
1556       skipped if the pattern is never actually used. The  start-up  optimiza‐
1557       tions  are  in effect a pre-scan of the subject that takes place before
1558       the pattern is run.
1559
1560       The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
1561       possibly  causing  performance  to  suffer,  but ensuring that in cases
1562       where the result is "no match", the callouts do occur, and  that  items
1563       such as (*COMMIT) and (*MARK) are considered at every possible starting
1564       position in the subject string.
1565
1566       Setting PCRE2_NO_START_OPTIMIZE may change the outcome  of  a  matching
1567       operation.  Consider the pattern
1568
1569         (*COMMIT)ABC
1570
1571       When  this  is compiled, PCRE2 records the fact that a match must start
1572       with the character "A". Suppose the subject  string  is  "DEFABC".  The
1573       start-up  optimization  scans along the subject, finds "A" and runs the
1574       first match attempt from there. The (*COMMIT) item means that the  pat‐
1575       tern  must  match the current starting position, which in this case, it
1576       does. However, if the same match is  run  with  PCRE2_NO_START_OPTIMIZE
1577       set,  the  initial  scan  along the subject string does not happen. The
1578       first match attempt is run starting  from  "D"  and  when  this  fails,
1579       (*COMMIT)  prevents  any  further  matches  being tried, so the overall
1580       result is "no match".
1581
1582       Another start-up optimization makes use  of  a  minimum  length  for  a
1583       matching subject, which is recorded when possible. Consider the pattern
1584
1585         (*MARK:1)B(*MARK:2)(X|Y)
1586
1587       The  minimum  length  for  a match is two characters. If the subject is
1588       "XXBB", the "starting character" optimization skips "XX", then tries to
1589       match  "BB", which is long enough. In the process, (*MARK:2) is encoun‐
1590       tered and remembered. When the match attempt fails,  the  next  "B"  is
1591       found,  but  there  is  only  one  character left, so there are no more
1592       attempts, and "no match" is returned with the "last mark seen"  set  to
1593       "2". If PCRE2_NO_START_OPTIMIZE is set, however, matches are tried at every
1594       possible starting position, including at the end of the subject,  where
1595       (*MARK:1)  is encountered, but there is no "B", so the "last mark seen"
1596       that is returned is "1". In this case, the optimizations do not  affect
1597       the overall match result, which is still "no match", but they do affect
1598       the auxiliary information that is returned.
1599
1600         PCRE2_NO_UTF_CHECK
1601
1602       When PCRE2_UTF is set, the validity of the pattern as a UTF  string  is
1603       automatically  checked.  There  are  discussions  about the validity of
1604       UTF-8 strings, UTF-16 strings, and UTF-32 strings in  the  pcre2unicode
1605       document.  If an invalid UTF sequence is found, pcre2_compile() returns
1606       a negative error code.
1607
1608       If you know that your pattern is a valid UTF string, and  you  want  to
1609       skip   this   check   for   performance   reasons,   you  can  set  the
1610       PCRE2_NO_UTF_CHECK option. When it is set, the  effect  of  passing  an
1611       invalid UTF string as a pattern is undefined. It may cause your program
1612       to crash or loop.
1613
1614       Note  that  this  option  can  also  be  passed  to  pcre2_match()  and
1615       pcre2_dfa_match(), to  suppress  UTF  validity  checking of the subject
1616       string.
1617
1618       Note also that setting PCRE2_NO_UTF_CHECK at compile time does not dis‐
1619       able  the error that is given if an escape sequence for an invalid Uni‐
1620       code code point is encountered in the pattern. In particular,  the  so-
1621       called  "surrogate"  code points (0xd800 to 0xdfff) are invalid. If you
1622       want to allow escape  sequences  such  as  \x{d800}  you  can  set  the
1623       PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  extra  option, as described in the
1624       section entitled "Extra compile options" below.  However, this is  pos‐
1625       sible only in UTF-8 and UTF-32 modes, because these values are not rep‐
1626       resentable in UTF-16.
1627
1628         PCRE2_UCP
1629
1630       This option has two effects. Firstly, it changes the way PCRE2 processes
1631       \B,  \b,  \D,  \d,  \S,  \s,  \W,  \w,  and some of the POSIX character
1632       classes. By default, only  ASCII  characters  are  recognized,  but  if
1633       PCRE2_UCP is set, Unicode properties are used instead to classify char‐
1634       acters. More details are given in  the  section  on  generic  character
1635       types  in  the pcre2pattern page. If you set PCRE2_UCP, matching one of
1636       the items it affects takes much longer.
1637
1638       The second effect of PCRE2_UCP is to force the use of  Unicode  proper‐
1639       ties  for  upper/lower casing operations on characters with code points
1640       greater than 127, even when PCRE2_UTF is not set. This makes it  possi‐
1641       ble,  for  example,  to  process strings in the 16-bit UCS-2 code. This
1642       option is available only if PCRE2 has been compiled with  Unicode  sup‐
1643       port (which is the default).
1644
1645         PCRE2_UNGREEDY
1646
1647       This  option  inverts  the "greediness" of the quantifiers so that they
1648       are not greedy by default, but become greedy if followed by "?". It  is
1649       not  compatible  with Perl. It can also be set by a (?U) option setting
1650       within the pattern.
1651
1652         PCRE2_USE_OFFSET_LIMIT
1653
1654       This option must be set for pcre2_compile() if pcre2_set_offset_limit()
1655       is  going  to be used to set a non-default offset limit in a match con‐
1656       text for matches that use this pattern. An error  is  generated  if  an
1657       offset  limit  is  set  without  this option. For more details, see the
1658       description of pcre2_set_offset_limit() in the section  that  describes
1659       match contexts. See also the PCRE2_FIRSTLINE option above.
1660
1661         PCRE2_UTF
1662
1663       This  option  causes  PCRE2  to regard both the pattern and the subject
1664       strings that are subsequently processed as strings  of  UTF  characters
1665       instead  of  single-code-unit  strings.  It  is available when PCRE2 is
1666       built to include Unicode support (which is  the  default).  If  Unicode
1667       support  is  not  available,  the use of this option provokes an error.
1668       Details of how PCRE2_UTF changes the behaviour of PCRE2  are  given  in
1669       the  pcre2unicode  page.  In  particular,  note that it changes the way
1670       PCRE2_CASELESS handles characters with code points greater than 127.
1671
1672   Extra compile options
1673
1674       The option bits that can be set in a compile  context  by  calling  the
1675       pcre2_set_compile_extra_options() function are as follows:
1676
1677         PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
1678
1679       This  option  applies when compiling a pattern in UTF-8 or UTF-32 mode.
1680       It is forbidden in UTF-16 mode, and ignored in non-UTF  modes.  Unicode
1681       "surrogate" code points in the range 0xd800 to 0xdfff are used in pairs
1682       in UTF-16 to encode code points with values in  the  range  0x10000  to
1683       0x10ffff.  The  surrogates  cannot  therefore be represented in UTF-16.
1684       They can be represented in UTF-8 and UTF-32, but are defined as invalid
1685       code  points,  and  cause  errors  if  encountered in a UTF-8 or UTF-32
1686       string that is being checked for validity by PCRE2.
1687
1688       These values also cause errors if encountered in escape sequences  such
1689       as \x{d912} within a pattern. However, it seems that some applications,
1690       when using PCRE2 to check for unwanted  characters  in  UTF-8  strings,
1691       explicitly   test  for  the  surrogates  using  escape  sequences.  The
1692       PCRE2_NO_UTF_CHECK option does  not  disable  the  error  that  occurs,
1693       because  it applies only to the testing of input strings for UTF valid‐
1694       ity.
1695
1696       If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set,  surro‐
1697       gate  code  point values in UTF-8 and UTF-32 patterns no longer provoke
1698       errors and are incorporated in the compiled pattern. However, they  can
1699       only  match  subject characters if the matching function is called with
1700       PCRE2_NO_UTF_CHECK set.
1701
1702         PCRE2_EXTRA_ALT_BSUX
1703
1704       The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u,  and
1705       \x  in  the way that ECMAscript (aka JavaScript) does. Additional func‐
1706       tionality was defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has
1707       the  effect  of PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..}
1708       as a hexadecimal character code, where hhh.. is any number of hexadeci‐
1709       mal digits.
1710
1711         PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL
1712
1713       This  is a dangerous option. Use with care. By default, an unrecognized
1714       escape such as \j or a malformed one such as \x{2z} causes  a  compile-
1715       time error when detected by pcre2_compile(). Perl is somewhat inconsis‐
1716       tent in handling such items: for example, \j is treated  as  a  literal
1717       "j",  and non-hexadecimal digits in \x{} are just ignored, though warn‐
1718       ings are given in both cases if Perl's warning switch is enabled.  How‐
1719       ever,  a  malformed  octal  number  after \o{ always causes an error in
1720       Perl.
1721
1722       If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL  extra  option  is  passed  to
1723       pcre2_compile(),  all  unrecognized  or  malformed escape sequences are
1724       treated as single-character escapes. For example, \j is a  literal  "j"
1725       and  \x{2z}  is  treated  as  the  literal string "x{2z}". Setting this
1726       option means that typos in patterns may go undetected  and  have  unex‐
1727       pected  results. Also note that a sequence such as [\N{] is interpreted
1728       as a malformed attempt at [\N{...}] and so is treated as  [N{]  whereas
1729       [\N]  gives  an  error  because  an  unqualified  \N  is a valid escape
1730       sequence but is not supported in a character class. To reiterate:  this
1731       is a dangerous option. Use with great care.
1732
1733         PCRE2_EXTRA_ESCAPED_CR_IS_LF
1734
1735       There  are  some  legacy applications where the escape sequence \r in a
1736       pattern is expected to match a newline. If this option is set, \r in  a
1737       pattern  is  converted to \n so that it matches a LF (linefeed) instead
1738       of a CR (carriage return) character. The option does not affect a  lit‐
1739       eral  CR in the pattern, nor does it affect CR specified as an explicit
1740       code point such as \x{0D}.
1741
1742         PCRE2_EXTRA_MATCH_LINE
1743
1744       This option is provided for use by  the  -x  option  of  pcre2grep.  It
1745       causes  the  pattern  only to match complete lines. This is achieved by
1746       automatically inserting the code for "^(?:" at the start  of  the  com‐
1747       piled  pattern  and ")$" at the end. Thus, when PCRE2_MULTILINE is set,
1748       the matched line may be in the  middle  of  the  subject  string.  This
1749       option can be used with PCRE2_LITERAL.
1750
1751         PCRE2_EXTRA_MATCH_WORD
1752
1753       This  option  is  provided  for  use  by the -w option of pcre2grep. It
1754       causes the pattern only to match strings that have a word  boundary  at
1755       the  start and the end. This is achieved by automatically inserting the
1756       code for "\b(?:" at the start of the compiled pattern and ")\b" at  the
1757       end.  The option may be used with PCRE2_LITERAL. However, it is ignored
1758       if PCRE2_EXTRA_MATCH_LINE is also set.
1759

JUST-IN-TIME (JIT) COMPILATION

1761
1762       int pcre2_jit_compile(pcre2_code *code, uint32_t options);
1763
1764       int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
1765         PCRE2_SIZE length, PCRE2_SIZE startoffset,
1766         uint32_t options, pcre2_match_data *match_data,
1767         pcre2_match_context *mcontext);
1768
1769       void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
1770
1771       pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE startsize,
1772         PCRE2_SIZE maxsize, pcre2_general_context *gcontext);
1773
1774       void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
1775         pcre2_jit_callback callback_function, void *callback_data);
1776
1777       void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
1778
1779       These functions provide support for  JIT  compilation,  which,  if  the
1780       just-in-time  compiler  is available, further processes a compiled pat‐
1781       tern into machine code that executes much faster than the pcre2_match()
1782       interpretive  matching function. Full details are given in the pcre2jit
1783       documentation.
1784
1785       JIT compilation is a heavyweight optimization. It can  take  some  time
1786       for  patterns  to  be analyzed, and for one-off matches and simple pat‐
1787       terns the benefit of faster execution might be offset by a much  slower
1788       compilation  time.  Most (but not all) patterns can be optimized by the
1789       JIT compiler.
1790

LOCALE SUPPORT

1792
1793       const uint8_t *pcre2_maketables(pcre2_general_context *gcontext);
1794
1795       void pcre2_maketables_free(pcre2_general_context *gcontext,
1796         const uint8_t *tables);
1797
1798       PCRE2 handles caseless matching, and determines whether characters  are
1799       letters,  digits, or whatever, by reference to a set of tables, indexed
1800       by character code point. However, this applies only to characters whose
1801       code  points  are  less than 256. By default, higher-valued code points
1802       never match escapes such as \w or \d.
1803
1804       When PCRE2 is built with Unicode support  (the  default),  the  Unicode
1805       properties of all characters can be tested with \p and \P, or, alterna‐
1806       tively, the PCRE2_UCP option can be set when  a  pattern  is  compiled;
1807       this  causes  \w and friends to use Unicode property support instead of
1808       the built-in tables.  PCRE2_UCP also causes upper/lower  casing  opera‐
1809       tions  on  characters  with code points greater than 127 to use Unicode
1810       properties. These effects apply even when PCRE2_UTF is not set.
1811
1812       The use of locales with Unicode is discouraged.  If  you  are  handling
1813       characters  with  code  points  greater than 127, you should either use
1814       Unicode support, or use locales, but not try to mix the two.
1815
1816       PCRE2 contains a built-in set of character  tables  that  are  used  by
1817       default.   These  are  sufficient  for many applications. Normally, the
1818       internal tables recognize only ASCII characters. However, when PCRE2 is
1819       built, it is possible to cause the internal tables to be rebuilt in the
1820       default "C" locale of the local system, which may cause them to be dif‐
1821       ferent.
1822
1823       The  built-in tables can be overridden by tables supplied by the appli‐
1824       cation that calls PCRE2. These may be created  in  a  different  locale
1825       from  the  default.  As more and more applications change to using Uni‐
1826       code, the need for this locale support is expected to die away.
1827
1828       External tables are built by calling the  pcre2_maketables()  function,
1829       in the relevant locale. The only argument to this function is a general
1830       context, which can be used to pass a custom memory  allocator.  If  the
1831       argument is NULL, the system malloc() is used. The result can be passed
1832       to pcre2_compile() as often as necessary, by creating a compile context
1833       and  calling  pcre2_set_character_tables()  to  set  the tables pointer
1834       therein.
1835
1836       For example, to build and use  tables  that  are  appropriate  for  the
1837       French  locale  (where accented characters with values greater than 127
1838       are treated as letters), the following code could be used:
1839
1840         setlocale(LC_CTYPE, "fr_FR");
1841         tables = pcre2_maketables(NULL);
1842         ccontext = pcre2_compile_context_create(NULL);
1843         pcre2_set_character_tables(ccontext, tables);
1844         re = pcre2_compile(..., ccontext);
1845
1846       The locale name "fr_FR" is used on Linux and other  Unix-like  systems;
1847       if you are using Windows, the name for the French locale is "french".
1848
1849       The pointer that is passed (via the compile context) to pcre2_compile()
1850       is saved with the compiled pattern, and the same tables are used by the
1851       matching  functions.  Thus,  for  any  single  pattern, compilation and
1852       matching both happen in the same locale, but different patterns can  be
1853       processed in different locales.
1854
1855       It  is the caller's responsibility to ensure that the memory containing
1856       the tables remains available while they are still in use. When they are
1857       no  longer  needed, you can discard them using pcre2_maketables_free(),
1858       which should pass as its first parameter the same general context  that
1859       was used to create the tables.
1860
1861   Saving locale tables
1862
1863       The  tables  described above are just a sequence of binary bytes, which
1864       makes them independent of hardware characteristics such  as  endianness
1865       or  whether  the processor is 32-bit or 64-bit. A copy of the result of
1866       pcre2_maketables() can therefore be saved in a file  or  elsewhere  and
1867       re-used  later, even in a different program or on another computer. The
1868       size of the tables (number  of  bytes)  must  be  obtained  by  calling
1869       pcre2_config()   with  the  PCRE2_CONFIG_TABLES_LENGTH  option  because
1870       pcre2_maketables()  does  not  return  this  value.   Note   that   the
1871       pcre2_dftables program, which is part of the PCRE2 build system, can be
1872       used stand-alone to create a file that contains a set of binary tables.
1873       See the pcre2build documentation for details.
1874

INFORMATION ABOUT A COMPILED PATTERN

1876
1877       int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);
1878
1879       The  pcre2_pattern_info()  function returns general information about a
1880       compiled pattern. For information about callouts, see the next section.
1881       The  first  argument  for pcre2_pattern_info() is a pointer to the com‐
1882       piled pattern. The second argument specifies which piece of information
1883       is  required,  and  the  third  argument  is a pointer to a variable to
1884       receive the data. If the third argument is NULL, the first argument  is
1885       ignored,  and  the  function  returns the size in bytes of the variable
1886       that is required for the information requested. Otherwise, the yield of
1887       the function is zero for success, or one of the following negative num‐
1888       bers:
1889
1890         PCRE2_ERROR_NULL           the argument code was NULL
1891         PCRE2_ERROR_BADMAGIC       the "magic number" was not found
1892         PCRE2_ERROR_BADOPTION      the value of what was invalid
1893         PCRE2_ERROR_UNSET          the requested field is not set
1894
1895       The "magic number" is placed at the start of each compiled pattern as a
1896       simple  check  against  passing  an arbitrary memory pointer. Here is a
1897       typical call of pcre2_pattern_info(), to obtain the length of the  com‐
1898       piled pattern:
1899
1900         int rc;
1901         size_t length;
1902         rc = pcre2_pattern_info(
1903           re,               /* result of pcre2_compile() */
1904           PCRE2_INFO_SIZE,  /* what is required */
1905           &length);         /* where to put the data */
1906
1907       The possible values for the second argument are defined in pcre2.h, and
1908       are as follows:
1909
1910         PCRE2_INFO_ALLOPTIONS
1911         PCRE2_INFO_ARGOPTIONS
1912         PCRE2_INFO_EXTRAOPTIONS
1913
1914       Return copies of the pattern's options. The third argument should point
1915       to  a  uint32_t  variable.  PCRE2_INFO_ARGOPTIONS  returns  exactly the
1916       options that were passed to pcre2_compile(), whereas  PCRE2_INFO_ALLOP‐
1917       TIONS  returns  the compile options as modified by any top-level (*XXX)
1918       option settings such as (*UTF) at the  start  of  the  pattern  itself.
1919       PCRE2_INFO_EXTRAOPTIONS  returns the extra options that were set in the
1920       compile context by calling the pcre2_set_compile_extra_options()  func‐
1921       tion.
1922
1923       For   example,   if  the  pattern  /(*UTF)abc/  is  compiled  with  the
1924       PCRE2_EXTENDED  option,  the  result   for   PCRE2_INFO_ALLOPTIONS   is
1925       PCRE2_EXTENDED  and  PCRE2_UTF.   Option settings such as (?i) that can
1926       change within a pattern do not affect the result  of  PCRE2_INFO_ALLOP‐
1927       TIONS, even if they appear right at the start of the pattern. (This was
1928       different in some earlier releases.)
1929
1930       A pattern compiled without PCRE2_ANCHORED is automatically anchored  by
1931       PCRE2 if the first significant item in every top-level branch is one of
1932       the following:
1933
1934         ^     unless PCRE2_MULTILINE is set
1935         \A    always
1936         \G    always
1937         .*    sometimes - see below
1938
1939       When .* is the first significant item, anchoring is possible only  when
1940       all the following are true:
1941
1942         .* is not in an atomic group
1943         .* is not in a capture group that is the subject
1944              of a backreference
1945         PCRE2_DOTALL is in force for .*
1946         Neither (*PRUNE) nor (*SKIP) appears in the pattern
1947         PCRE2_NO_DOTSTAR_ANCHOR is not set
1948
1949       For  patterns  that are auto-anchored, the PCRE2_ANCHORED bit is set in
1950       the options returned for PCRE2_INFO_ALLOPTIONS.
1951
1952         PCRE2_INFO_BACKREFMAX
1953
1954       Return the number of the highest  backreference  in  the  pattern.  The
1955       third  argument  should  point  to  a  uint32_t variable. Named capture
1956       groups acquire numbers as well as names, and these  count  towards  the
1957       highest  backreference.  Backreferences  such as \4 or \g{12} match the
1958       captured characters of the given group, but in addition, the check that
1959       a capture group is set in a conditional group such as (?(3)a|b) is also
1960       a backreference.  Zero is returned if there are no backreferences.
1961
1962         PCRE2_INFO_BSR
1963
1964       The output is a uint32_t integer whose value indicates  what  character
1965       sequences  the \R escape sequence matches. A value of PCRE2_BSR_UNICODE
1966       means that \R matches any Unicode line  ending  sequence;  a  value  of
1967       PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF.
1968
1969         PCRE2_INFO_CAPTURECOUNT
1970
1971       Return  the  highest  capture  group number in the pattern. In patterns
1972       where (?| is not used, this is also the total number of capture groups.
1973       The third argument should point to a uint32_t variable.
1974
1975         PCRE2_INFO_DEPTHLIMIT
1976
1977       If  the  pattern set a backtracking depth limit by including an item of
1978       the form (*LIMIT_DEPTH=nnnn) at the start, the value is  returned.  The
1979       third argument should point to a uint32_t integer. If no such value has
1980       been  set,  the  call  to  pcre2_pattern_info()   returns   the   error
1981       PCRE2_ERROR_UNSET. Note that this limit will only be used during match‐
1982       ing if it is less than the limit set or defaulted by the caller of  the
1983       match function.
1984
1985         PCRE2_INFO_FIRSTBITMAP
1986
1987       In  the absence of a single first code unit for a non-anchored pattern,
1988       pcre2_compile() may construct a 256-bit table that defines a fixed  set
1989       of  values for the first code unit in any match. For example, a pattern
1990       that starts with [abc] results in a table with  three  bits  set.  When
1991       code  unit  values greater than 255 are supported, the flag bit for 255
1992       means "any code unit of value 255 or above". If such a table  was  con‐
1993       structed,  a pointer to it is returned. Otherwise NULL is returned. The
1994       third argument should point to a const uint8_t * variable.
1995
1996         PCRE2_INFO_FIRSTCODETYPE
1997
1998       Return information about the first code unit of any matched string, for
1999       a  non-anchored  pattern. The third argument should point to a uint32_t
2000       variable. If there is a fixed first value, for example, the letter  "c"
2001       from  a  pattern such as (cat|cow|coyote), 1 is returned, and the value
2002       can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is  no  fixed
2003       first  value,  but it is known that a match can occur only at the start
2004       of the subject or following a newline in the subject,  2  is  returned.
2005       Otherwise, and for anchored patterns, 0 is returned.
2006
2007         PCRE2_INFO_FIRSTCODEUNIT
2008
2009       Return  the  value  of  the first code unit of any matched string for a
2010       pattern where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise  return  0.
2011       The  third  argument  should point to a uint32_t variable. In the 8-bit
2012       library, the value is always less than 256. In the 16-bit  library  the
2013       value  can  be  up  to 0xffff. In the 32-bit library in UTF-32 mode the
2014       value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32
2015       mode.
2016
2017         PCRE2_INFO_FRAMESIZE
2018
2019       Return the size (in bytes) of the data frames that are used to remember
2020       backtracking positions when the pattern is processed  by  pcre2_match()
2021       without  the  use  of  JIT. The third argument should point to a size_t
2022       variable. The frame size depends on the number of capturing parentheses
2023       in the pattern. Each additional capture group adds two PCRE2_SIZE vari‐
2024       ables.
2025
2026         PCRE2_INFO_HASBACKSLASHC
2027
2028       Return 1 if the pattern contains any instances of \C, otherwise 0.  The
2029       third argument should point to a uint32_t variable.
2030
2031         PCRE2_INFO_HASCRORLF
2032
2033       Return  1  if  the  pattern  contains any explicit matches for CR or LF
2034       characters, otherwise 0. The third argument should point to a  uint32_t
2035       variable.  An explicit match is either a literal CR or LF character, or
2036       \r or  \n  or  one  of  the  equivalent  hexadecimal  or  octal  escape
2037       sequences.
2038
2039         PCRE2_INFO_HEAPLIMIT
2040
2041       If the pattern set a heap memory limit by including an item of the form
2042       (*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argu‐
2043       ment should point to a uint32_t integer. If no such value has been set,
2044       the call to pcre2_pattern_info() returns the  error  PCRE2_ERROR_UNSET.
2045       Note  that  this  limit will only be used during matching if it is less
2046       than the limit set or defaulted by the caller of the match function.
2047
2048         PCRE2_INFO_JCHANGED
2049
2050       Return 1 if the (?J) or (?-J) option setting is used  in  the  pattern,
2051       otherwise  0.  The  third argument should point to a uint32_t variable.
2052       (?J) and (?-J) set and unset the local PCRE2_DUPNAMES  option,  respec‐
2053       tively.
2054
2055         PCRE2_INFO_JITSIZE
2056
2057       If  the  compiled  pattern was successfully processed by pcre2_jit_com‐
2058       pile(), return the size of the  JIT  compiled  code,  otherwise  return
2059       zero. The third argument should point to a size_t variable.
2060
2061         PCRE2_INFO_LASTCODETYPE
2062
2063       Return 1 if there is a rightmost literal code unit that must  exist  in
2064       any matched string, other than at its start. The third argument  should
2065       point to a uint32_t variable. If there is no such value, 0 is returned.
2066       When 1 is returned, the code unit value itself can be  retrieved  using
2067       PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
2068       recorded only if it follows something of variable length. For  example,
2069       for  the pattern /^a\d+z\d+/ the returned value is 1 (with "z" returned
2070       from PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value  is
2071       0.
2072
2073         PCRE2_INFO_LASTCODEUNIT
2074
2075       Return  the value of the rightmost literal code unit that must exist in
2076       any matched string, other than  at  its  start,  for  a  pattern  where
2077       PCRE2_INFO_LASTCODETYPE returns 1. Otherwise, return 0. The third argu‐
2078       ment should point to a uint32_t variable.
2079
2080         PCRE2_INFO_MATCHEMPTY
2081
2082       Return 1 if the pattern might match an empty string, otherwise  0.  The
2083       third argument should point to a uint32_t variable. When a pattern con‐
2084       tains recursive subroutine calls it is not always possible to determine
2085       whether  or  not  it  can match an empty string. PCRE2 takes a cautious
2086       approach and returns 1 in such cases.
2087
2088         PCRE2_INFO_MATCHLIMIT
2089
2090       If the pattern set a match limit by  including  an  item  of  the  form
2091       (*LIMIT_MATCH=nnnn)  at  the  start,  the  value is returned. The third
2092       argument should point to a uint32_t integer. If no such value has  been
2093       set,    the    call   to   pcre2_pattern_info()   returns   the   error
2094       PCRE2_ERROR_UNSET. Note that this limit will only be used during match‐
2095       ing  if it is less than the limit set or defaulted by the caller of the
2096       match function.
2097
2098         PCRE2_INFO_MAXLOOKBEHIND
2099
2100       A lookbehind assertion moves back a certain number of  characters  (not
2101       code  units)  when  it  starts  to  process  each of its branches. This
2102       request returns the largest of these backward moves. The third argument
2103       should  point  to  a  uint32_t integer. The simple assertions \b and \B
2104       require a one-character lookbehind and  cause  PCRE2_INFO_MAXLOOKBEHIND
2105       to return 1 in the absence of anything longer. \A also registers a one-
2106       character lookbehind, though it does not actually inspect the  previous
2107       character.
2108
2109       Note that this information is useful for multi-segment matching only if
2110       the pattern contains no nested lookbehinds. For  example,  the  pattern
2111       (?<=a(?<=ba)c)  returns  a maximum lookbehind of 2, but when it is pro‐
2112       cessed, the first lookbehind moves back by two characters, matches  one
2113       character,  then  the  nested lookbehind also moves back by two charac‐
2114       ters. This puts the matching point three characters earlier than it was
2115       at  the  start.   PCRE2_INFO_MAXLOOKBEHIND  is  really only useful as a
2116       debugging tool. See the pcre2partial documentation for a discussion  of
2117       multi-segment matching.
2118
2119         PCRE2_INFO_MINLENGTH
2120
2121       If  a  minimum  length  for  matching subject strings was computed, its
2122       value is returned. Otherwise the returned value is 0. This value is not
2123       computed  when PCRE2_NO_START_OPTIMIZE is set. The value is a number of
2124       characters, which in UTF mode may be different from the number of  code
2125       units.  The  third  argument  should  point to a uint32_t variable. The
2126       value is a lower bound to the length of any matching string. There  may
2127       not  be  any  strings  of that length that do actually match, but every
2128       string that does match is at least that long.
2129
2130         PCRE2_INFO_NAMECOUNT
2131         PCRE2_INFO_NAMEENTRYSIZE
2132         PCRE2_INFO_NAMETABLE
2133
2134       PCRE2 supports the use of named as well as numbered capturing parenthe‐
2135       ses.  The names are just an additional way of identifying the parenthe‐
2136       ses, which still acquire numbers. Several convenience functions such as
2137       pcre2_substring_get_byname()  are provided for extracting captured sub‐
2138       strings by name. It is also possible to extract the data  directly,  by
2139       first  converting  the  name to a number in order to access the correct
2140       pointers in the output vector (described with pcre2_match() below).  To
2141       do  the  conversion,  you  need to use the name-to-number map, which is
2142       described by these three values.
2143
2144       The map consists of a number of  fixed-size  entries.  PCRE2_INFO_NAME‐
2145       COUNT  gives  the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives
2146       the size of each entry in code units; both of these return  a  uint32_t
2147       value. The entry size depends on the length of the longest name.
2148
2149       PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table.
2150       This is a PCRE2_SPTR pointer to a block of code  units.  In  the  8-bit
2151       library,  the  first two bytes of each entry are the number of the cap‐
2152       turing parenthesis, most significant byte first. In the 16-bit library,
2153       the  pointer  points  to 16-bit code units, the first of which contains
2154       the parenthesis number. In the 32-bit library, the  pointer  points  to
2155       32-bit  code units, the first of which contains the parenthesis number.
2156       The rest of the entry is the corresponding name, zero terminated.
2157
2158       The names are in alphabetical order. If (?| is used to create  multiple
2159       capture  groups  with  the  same number, as described in the section on
2160       duplicate group numbers in the pcre2pattern page,  the  groups  may  be
2161       given  the same name, but there is only one entry in the table. Differ‐
2162       ent names for groups of the same number are not permitted.
2163
2164       Duplicate names for capture groups with different numbers  are  permit‐
2165       ted, but only if PCRE2_DUPNAMES is set. They appear in the table in the
2166       order in which they were found in the pattern. In the  absence  of  (?|
2167       this  is  the  order of increasing number; when (?| is used this is not
2168       necessarily the case because later capture groups may have  lower  num‐
2169       bers.
2170
2171       As  a  simple  example of the name/number table, consider the following
2172       pattern after compilation by the 8-bit library  (assume  PCRE2_EXTENDED
2173       is set, so white space - including newlines - is ignored):
2174
2175         (?<date> (?<year>(\d\d)?\d\d) -
2176         (?<month>\d\d) - (?<day>\d\d) )
2177
2178       There are four named capture groups, so the table has four entries, and
2179       each entry in the table is eight bytes long. The table is  as  follows,
2180       with non-printing bytes shown in hexadecimal, and undefined bytes shown
2181       as ??:
2182
2183         00 01 d  a  t  e  00 ??
2184         00 05 d  a  y  00 ?? ??
2185         00 04 m  o  n  t  h  00
2186         00 02 y  e  a  r  00 ??
2187
2188       When writing code to extract data from named capture groups  using  the
2189       name-to-number  map,  remember that the length of the entries is likely
2190       to be different for each compiled pattern.
2191
2192         PCRE2_INFO_NEWLINE
2193
2194       The output is one of the following uint32_t values:
2195
2196         PCRE2_NEWLINE_CR       Carriage return (CR)
2197         PCRE2_NEWLINE_LF       Linefeed (LF)
2198         PCRE2_NEWLINE_CRLF     Carriage return, linefeed (CRLF)
2199         PCRE2_NEWLINE_ANY      Any Unicode line ending
2200         PCRE2_NEWLINE_ANYCRLF  Any of CR, LF, or CRLF
2201         PCRE2_NEWLINE_NUL      The NUL character (binary zero)
2202
2203       This identifies the character sequence that will be recognized as mean‐
2204       ing "newline" while matching.
2205
2206         PCRE2_INFO_SIZE
2207
2208       Return  the  size  of  the  compiled  pattern  in  bytes (for all three
2209       libraries). The third argument should point to a size_t variable.  This
2210       value  includes  the  size  of the general data block that precedes the
2211       code units of the compiled pattern itself. The value that is used  when
2212       pcre2_compile()  is  getting memory in which to place the compiled pat‐
2213       tern may be slightly larger than the value  returned  by  this  option,
2214       because  there are cases where the code that calculates the size has to
2215       over-estimate. Processing a pattern with  the  JIT  compiler  does  not
2216       alter the value returned by this option.
2217

INFORMATION ABOUT A PATTERN'S CALLOUTS

2219
2220       int pcre2_callout_enumerate(const pcre2_code *code,
2221         int (*callback)(pcre2_callout_enumerate_block *, void *),
2222         void *user_data);
2223
2224       A script language that supports the use of string arguments in callouts
2225       might like to scan all the callouts in a  pattern  before  running  the
2226       match. This can be done by calling pcre2_callout_enumerate(). The first
2227       argument is a pointer to a compiled pattern, the  second  points  to  a
2228       callback  function,  and the third is arbitrary user data. The callback
2229       function is called for every callout in the pattern  in  the  order  in
2230       which they appear. Its first argument is a pointer to a callout enumer‐
2231       ation block, and its second argument is the user_data  value  that  was
2232       passed  to  pcre2_callout_enumerate(). The contents of the callout enu‐
2233       meration block are described in the pcre2callout  documentation,  which
2234       also gives further details about callouts.
2235

SERIALIZATION AND PRECOMPILING

2237
2238       It  is  possible  to  save  compiled patterns on disc or elsewhere, and
2239       reload them later, subject to a number of  restrictions.  The  host  on
2240       which  the  patterns  are  reloaded must be running the same version of
2241       PCRE2, with the same code unit width, and must also have the same endi‐
2242       anness,  pointer  width,  and PCRE2_SIZE type. Before compiled patterns
2243       can be saved, they must be converted to a "serialized" form,  which  in
2244       the  case of PCRE2 is really just a bytecode dump.  The functions whose
2245       names begin with pcre2_serialize_ are used for converting to  and  from
2246       the  serialized form. They are described in the pcre2serialize documen‐
2247       tation. Note that PCRE2 serialization does not  convert  compiled  pat‐
2248       terns to an abstract format like Java or .NET serialization.
2249

THE MATCH DATA BLOCK

2251
2252       pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
2253         pcre2_general_context *gcontext);
2254
2255       pcre2_match_data *pcre2_match_data_create_from_pattern(
2256         const pcre2_code *code, pcre2_general_context *gcontext);
2257
2258       void pcre2_match_data_free(pcre2_match_data *match_data);
2259
2260       Information  about  a  successful  or unsuccessful match is placed in a
2261       match data block, which is an opaque  structure  that  is  accessed  by
2262       function  calls.  In particular, the match data block contains a vector
2263       of offsets into the subject string that define the matched part of  the
2264       subject  and  any  substrings  that were captured. This is known as the
2265       ovector.
2266
2267       Before calling pcre2_match(), pcre2_dfa_match(),  or  pcre2_jit_match()
2268       you must create a match data block by calling one of the creation func‐
2269       tions above. For pcre2_match_data_create(), the first argument  is  the
2270       number  of  pairs  of  offsets  in  the ovector. One pair of offsets is
2271       required to identify the string that matched the whole pattern, with an
2272       additional  pair for each captured substring. For example, a value of 4
2273       creates enough space to record the matched portion of the subject  plus
2274       three  captured  substrings.  A  minimum  of  1 pair  is  imposed by
2275       pcre2_match_data_create(), so it is always possible to return the over‐
2276       all matched string.
2277
2278       The second argument of pcre2_match_data_create() is a pointer to a gen‐
2279       eral context, which can specify custom memory management for  obtaining
2280       the memory for the match data block. If you are not using custom memory
2281       management, pass NULL, which causes malloc() to be used.
2282
2283       For pcre2_match_data_create_from_pattern(), the  first  argument  is  a
2284       pointer to a compiled pattern. The ovector is created to be exactly the
2285       right size to hold all the substrings a pattern might capture. The sec‐
2286       ond  argument is again a pointer to a general context, but in this case
2287       if NULL is passed, the memory is obtained using the same allocator that
2288       was used for the compiled pattern (custom or default).
2289
2290       A  match  data block can be used many times, with the same or different
2291       compiled patterns. You can extract information from a match data  block
2292       after  a  match  operation  has  finished,  using  functions  that  are
2293       described in the sections on  matched  strings  and  other  match  data
2294       below.
2295
2296       When  a  call  of  pcre2_match()  fails, valid data is available in the
2297       match  data  block  only   when   the   error   is  PCRE2_ERROR_NOMATCH,
2298       PCRE2_ERROR_PARTIAL,  or  one  of  the  error  codes for an invalid UTF
2299       string. Exactly what is available depends on the error, and is detailed
2300       below.
2301
2302       When  one of the matching functions is called, pointers to the compiled
2303       pattern and the subject string are set in the match data block so  that
2304       they  can  be referenced by the extraction functions after a successful
2305       match. After running a match, you must not free a compiled pattern or a
2306       subject  string until after all operations on the match data block (for
2307       that match) have taken place,  unless,  in  the  case  of  the  subject
2308       string,  you  have used the PCRE2_COPY_MATCHED_SUBJECT option, which is
2309       described in the  section  entitled  "Option  bits  for  pcre2_match()"
2310       below.
2311
2312       When  a match data block itself is no longer needed, it should be freed
2313       by calling pcre2_match_data_free(). If this function is called  with  a
2314       NULL argument, it returns immediately, without doing anything.
2315

MATCHING A PATTERN: THE TRADITIONAL FUNCTION

2317
2318       int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
2319         PCRE2_SIZE length, PCRE2_SIZE startoffset,
2320         uint32_t options, pcre2_match_data *match_data,
2321         pcre2_match_context *mcontext);
2322
2323       The  function pcre2_match() is called to match a subject string against
2324       a compiled pattern, which is passed in the code argument. You can  call
2325       pcre2_match() with the same code argument as many times as you like, in
2326       order to find multiple matches in the subject string or to  match  dif‐
2327       ferent subject strings with the same pattern.
2328
2329       This  function  is  the  main  matching facility of the library, and it
2330       operates in a Perl-like manner. For specialist use  there  is  also  an
2331       alternative  matching function, which is described below in the section
2332       about the pcre2_dfa_match() function.
2333
2334       Here is an example of a simple call to pcre2_match():
2335
2336         pcre2_match_data *md = pcre2_match_data_create(4, NULL);
2337         int rc = pcre2_match(
2338           re,             /* result of pcre2_compile() */
2339           "some string",  /* the subject string */
2340           11,             /* the length of the subject string */
2341           0,              /* start at offset 0 in the subject */
2342           0,              /* default options */
2343           md,             /* the match data block */
2344           NULL);          /* a match context; NULL means use defaults */
2345
2346       If the subject string is zero-terminated, the length can  be  given  as
2347       PCRE2_ZERO_TERMINATED. A match context must be provided if certain less
2348       common matching parameters are to be changed. For details, see the sec‐
2349       tion on the match context above.
2350
2351   The string to be matched by pcre2_match()
2352
2353       The  subject string is passed to pcre2_match() as a pointer in subject,
2354       a length in length, and a starting offset in  startoffset.  The  length
2355       and  offset  are  in  code units, not characters.  That is, they are in
2356       bytes for the 8-bit library, 16-bit code units for the 16-bit  library,
2357       and  32-bit  code units for the 32-bit library, whether or not UTF pro‐
2358       cessing is enabled.
2359
2360       If startoffset is greater than the length of the subject, pcre2_match()
2361       returns  PCRE2_ERROR_BADOFFSET.  When  the starting offset is zero, the
2362       search for a match starts at the beginning of the subject, and this  is
2363       by far the most common case. In UTF-8 or UTF-16 mode, the starting off‐
2364       set must point to the start of a character, or to the end of  the  sub‐
2365       ject  (in  UTF-32 mode, one code unit equals one character, so all off‐
2366       sets are valid). Like the  pattern  string,  the  subject  may  contain
2367       binary zeros.
2368
2369       A  non-zero  starting offset is useful when searching for another match
2370       in the same subject by calling pcre2_match()  again  after  a  previous
2371       success.   Setting  startoffset  differs  from passing over a shortened
2372       string and setting PCRE2_NOTBOL in the case of a  pattern  that  begins
2373       with any kind of lookbehind. For example, consider the pattern
2374
2375         \Biss\B
2376
2377       which  finds  occurrences  of "iss" in the middle of words. (\B matches
2378       only if the current position in the subject is not  a  word  boundary.)
2379       When applied to the string "Mississippi" the first call to pcre2_match()
2380       finds the first occurrence. If pcre2_match() is called again with  just
2381       the  remainder  of  the  subject,  namely "issippi", it does not match,
2382       because \B is always false at the start of the subject, which is deemed
2383       to  be  a word boundary. However, if pcre2_match() is passed the entire
2384       string again, but with startoffset set to 4, it finds the second occur‐
2385       rence  of "iss" because it is able to look behind the starting point to
2386       discover that it is preceded by a letter.
2387
2388       Finding all the matches in a subject is tricky  when  the  pattern  can
2389       match an empty string. It is possible to emulate Perl's /g behaviour by
2390       first  trying  the  match  again  at  the   same   offset,   with   the
2391       PCRE2_NOTEMPTY_ATSTART  and  PCRE2_ANCHORED  options,  and then if that
2392       fails, advancing the starting  offset  and  trying  an  ordinary  match
2393       again.  There  is  some  code  that  demonstrates how to do this in the
2394       pcre2demo sample program. In the most general case, you have  to  check
2395       to  see  if the newline convention recognizes CRLF as a newline, and if
2396       so, and the current character is CR followed by LF, advance the  start‐
2397       ing offset by two characters instead of one.
2398
2399       If a non-zero starting offset is passed when the pattern is anchored, a
2400       single attempt to match at the given offset is made. This can only suc‐
2401       ceed  if  the  pattern does not require the match to be at the start of
2402       the subject. In other words, the anchoring must be the result  of  set‐
2403       ting  the PCRE2_ANCHORED option or the use of .* with PCRE2_DOTALL, not
2404       by starting the pattern with ^ or \A.
2405
2406   Option bits for pcre2_match()
2407
2408       The unused bits of the options argument for pcre2_match() must be zero.
2409       The    only    bits    that    may    be    set   are   PCRE2_ANCHORED,
2410       PCRE2_COPY_MATCHED_SUBJECT,      PCRE2_ENDANCHORED,       PCRE2_NOTBOL,
2411       PCRE2_NOTEOL,   PCRE2_NOTEMPTY,  PCRE2_NOTEMPTY_ATSTART,  PCRE2_NO_JIT,
2412       PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and  PCRE2_PARTIAL_SOFT.  Their
2413       action is described below.
2414
2415       Setting  PCRE2_ANCHORED  or PCRE2_ENDANCHORED at match time is not sup‐
2416       ported by the just-in-time (JIT) compiler. If it is set,  JIT  matching
2417       is  disabled  and  the interpretive code in pcre2_match() is run. Apart
2418       from PCRE2_NO_JIT (obviously), the remaining options are supported  for
2419       JIT matching.
2420
2421         PCRE2_ANCHORED
2422
2423       The PCRE2_ANCHORED option limits pcre2_match() to matching at the first
2424       matching position. If a pattern was compiled  with  PCRE2_ANCHORED,  or
2425       turned  out to be anchored by virtue of its contents, it cannot be made
2426       unanchored at matching time. Note that setting the option at match time
2427       disables JIT matching.
2428
2429         PCRE2_COPY_MATCHED_SUBJECT
2430
2431       By  default,  a  pointer to the subject is remembered in the match data
2432       block so that, after a successful match, it can be  referenced  by  the
2433       substring  extraction  functions.  This means that the subject's memory
2434       must not be freed until all such  operations  are  complete.  For  some
2435       applications  where  the  lifetime of the subject string is not guaran‐
2436       teed, it may be necessary to make a copy of the subject string, but  it
2437       is wasteful to do this unless the match is successful. After a success‐
2438       ful match, if PCRE2_COPY_MATCHED_SUBJECT is set, the subject is  copied
2439       and  the  new  pointer is remembered in the match data block instead of
2440       the original subject pointer. The memory allocator that  was  used  for
2441       the  match  block  itself is used. The copy is automatically freed when
2442       pcre2_match_data_free() is called to free the match data block.  It  is
2443       also automatically freed if the match data block is re-used for another
2444       match operation.
2445
2446         PCRE2_ENDANCHORED
2447
2448       If the PCRE2_ENDANCHORED option is set, any string  that  pcre2_match()
2449       matches  must be right at the end of the subject string. Note that set‐
2450       ting the option at match time disables JIT matching.
2451
2452         PCRE2_NOTBOL
2453
2454       This option specifies that the first character of the subject string is not
2455       the  beginning  of  a  line, so the circumflex metacharacter should not
2456       match before it. Setting this without  having  set  PCRE2_MULTILINE  at
2457       compile time causes circumflex never to match. This option affects only
2458       the behaviour of the circumflex metacharacter. It does not affect \A.
2459
2460         PCRE2_NOTEOL
2461
2462       This option specifies that the end of the subject string is not the end
2463       of  a line, so the dollar metacharacter should not match it nor (except
2464       in multiline mode) a newline immediately before it. Setting this  with‐
2465       out  having  set PCRE2_MULTILINE at compile time causes dollar never to
2466       match. This option affects only the behaviour of the dollar metacharac‐
2467       ter. It does not affect \Z or \z.
2468
2469         PCRE2_NOTEMPTY
2470
2471       An empty string is not considered to be a valid match if this option is
2472       set. If there are alternatives in the pattern, they are tried.  If  all
2473       the  alternatives  match  the empty string, the entire match fails. For
2474       example, if the pattern
2475
2476         a?b?
2477
2478       is applied to a string not beginning with "a" or  "b",  it  matches  an
2479       empty string at the start of the subject. With PCRE2_NOTEMPTY set, this
2480       match is not valid, so pcre2_match() searches further into  the  string
2481       for occurrences of "a" or "b".
2482
2483         PCRE2_NOTEMPTY_ATSTART
2484
2485       This  is  like PCRE2_NOTEMPTY, except that it locks out an empty string
2486       match only at the first matching position, that is, at the start of the
2487       subject  plus  the  starting offset. An empty string match later in the
2488       subject is permitted.  If the pattern is anchored,  such  a  match  can
2489       occur only if the pattern contains \K.
2490
2491         PCRE2_NO_JIT
2492
2493       By   default,   if   a  pattern  has  been  successfully  processed  by
2494       pcre2_jit_compile(), JIT is automatically used  when  pcre2_match()  is
2495       called  with  options  that JIT supports. Setting PCRE2_NO_JIT disables
2496       the use of JIT; it forces matching to be done by the interpreter.
2497
2498         PCRE2_NO_UTF_CHECK
2499
2500       When PCRE2_UTF is set at compile time, the validity of the subject as a
2501       UTF   string   is   checked  unless  PCRE2_NO_UTF_CHECK  is  passed  to
2502       pcre2_match() or PCRE2_MATCH_INVALID_UTF was passed to pcre2_compile().
2503       The latter special case is discussed in detail in the pcre2unicode doc‐
2504       umentation.
2505
2506       In the default case, if a non-zero starting offset is given, the  check
2507       is  applied  only  to  that part of the subject that could be inspected
2508       during matching, and there is a check that the starting  offset  points
2509       to  the first code unit of a character or to the end of the subject. If
2510       there are no lookbehind assertions in the pattern, the check starts  at
2511       the starting offset.  Otherwise, it starts at the length of the longest
2512       lookbehind before the starting offset, or at the start of  the  subject
2513       if  there are not that many characters before the starting offset. Note
2514       that the sequences \b and \B are one-character lookbehinds.
2515
2516       The check is carried out before any other processing takes place, and a
2517       negative  error  code is returned if the check fails. There are several
2518       UTF error codes for each code unit width,  corresponding  to  different
2519       problems  with  the code unit sequence. There are discussions about the
2520       validity of UTF-8 strings, UTF-16 strings, and UTF-32  strings  in  the
2521       pcre2unicode documentation.
2522
2523       If you know that your subject is valid, and you want to skip this check
2524       for performance reasons, you can set the PCRE2_NO_UTF_CHECK option when
2525       calling  pcre2_match().  You  might  want to do this for the second and
2526       subsequent calls to pcre2_match() if you are making repeated  calls  to
2527       find multiple matches in the same subject string.
2528
2529       Warning:  Unless  PCRE2_MATCH_INVALID_UTF was set at compile time, when
2530       PCRE2_NO_UTF_CHECK is set at  match  time  the  effect  of  passing  an
2531       invalid  string  as  a  subject, or an invalid value of startoffset, is
2532       undefined.  Your program may crash or loop indefinitely or  give  wrong
2533       results.
2534
2535         PCRE2_PARTIAL_HARD
2536         PCRE2_PARTIAL_SOFT
2537
2538       These  options  turn  on  the partial matching feature. A partial match
2539       occurs if the end of the subject string is  reached  successfully,  but
2540       there are not enough subject characters to complete the match. In addi‐
2541       tion, either at least one character must have  been  inspected  or  the
2542       pattern  must  contain  a  lookbehind,  or the pattern must be one that
2543       could match an empty string.
2544
2545       If this situation arises when PCRE2_PARTIAL_SOFT  (but  not  PCRE2_PAR‐
2546       TIAL_HARD) is set, matching continues by testing any remaining alterna‐
2547       tives. Only if no complete match can be  found  is  PCRE2_ERROR_PARTIAL
2548       returned  instead  of  PCRE2_ERROR_NOMATCH.  In other words, PCRE2_PAR‐
2549       TIAL_SOFT specifies that the caller is prepared  to  handle  a  partial
2550       match, but only if no complete match can be found.
2551
2552       If  PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this
2553       case, if a partial match is found,  pcre2_match()  immediately  returns
2554       PCRE2_ERROR_PARTIAL,  without  considering  any  other alternatives. In
2555       other words, when PCRE2_PARTIAL_HARD is set, a partial match is consid‐
2556       ered to be more important than an alternative complete match.
2557
2558       There is a more detailed discussion of partial and multi-segment match‐
2559       ing, with examples, in the pcre2partial documentation.
2560

NEWLINE HANDLING WHEN MATCHING

2562
2563       When PCRE2 is built, a default newline convention is set; this is  usu‐
2564       ally  the standard convention for the operating system. The default can
2565       be overridden in a compile context by calling  pcre2_set_newline().  It
2566       can  also be overridden by starting a pattern string with, for example,
2567       (*CRLF), as described in the section  on  newline  conventions  in  the
2568       pcre2pattern  page. During matching, the newline choice affects the be‐
2569       haviour of the dot, circumflex, and dollar metacharacters. It may  also
2570       alter  the  way  the  match starting position is advanced after a match
2571       failure for an unanchored pattern.
2572
2573       When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is
2574       set  as  the  newline convention, and a match attempt for an unanchored
2575       pattern fails when the current starting position is at a CRLF sequence,
2576       and  the  pattern contains no explicit matches for CR or LF characters,
2577       the match position is advanced by two characters  instead  of  one,  in
2578       other words, to after the CRLF.
2579
2580       The above rule is a compromise that makes the most common cases work as
2581       expected. For example, if the pattern  is  .+A  (and  the  PCRE2_DOTALL
2582       option is not set), it does not match the string "\r\nA" because, after
2583       failing at the start, it skips both the CR and the LF before  retrying.
2584       However,  the  pattern  [\r\n]A does match that string, because it con‐
2585       tains an explicit CR or LF reference, and so advances only by one char‐
2586       acter after the first failure.
2587
2588       An explicit match for CR or LF is either a literal appearance of one of
2589       those characters in the pattern, or one of the \r or \n  or  equivalent
2590       octal or hexadecimal escape sequences. Implicit matches such as [^X] do
2591       not count, nor does \s, even though it includes CR and LF in the  char‐
2592       acters that it matches.
2593
2594       Notwithstanding  the above, anomalous effects may still occur when CRLF
2595       is a valid newline sequence and explicit \r or \n escapes appear in the
2596       pattern.
2597

HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS

2599
2600       uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
2601
2602       PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
2603
2604       In  general, a pattern matches a certain portion of the subject, and in
2605       addition, further substrings from the subject  may  be  picked  out  by
2606       parenthesized  parts  of  the  pattern.  Following the usage in Jeffrey
2607       Friedl's book, this is called "capturing"  in  what  follows,  and  the
2608       phrase  "capture  group" (Perl terminology) is used for a fragment of a
2609       pattern that picks out a substring. PCRE2 supports several other  kinds
2610       of parenthesized group that do not cause substrings to be captured. The
2611       pcre2_pattern_info() function can be used to find out how many  capture
2612       groups there are in a compiled pattern.
2613
2614       You  can  use  auxiliary functions for accessing captured substrings by
2615       number or by name, as described in sections below.
2616
2617       Alternatively, you can make direct use of the vector of PCRE2_SIZE val‐
2618       ues,  called  the  ovector,  which  contains  the  offsets  of captured
2619       strings.  It  is  part  of  the  match  data   block.    The   function
2620       pcre2_get_ovector_pointer()  returns  the  address  of the ovector, and
2621       pcre2_get_ovector_count() returns the number of pairs of values it con‐
2622       tains.
2623
2624       Within the ovector, the first in each pair of values is set to the off‐
2625       set of the first code unit of a substring, and the second is set to the
2626       offset  of the first code unit after the end of a substring. These val‐
2627       ues are always code unit offsets, not character offsets. That is,  they
2628       are  byte  offsets  in  the 8-bit library, 16-bit offsets in the 16-bit
2629       library, and 32-bit offsets in the 32-bit library.
2630
2631       After a partial match  (error  return  PCRE2_ERROR_PARTIAL),  only  the
2632       first  pair  of  offsets  (that is, ovector[0] and ovector[1]) are set.
2633       They identify the part of the subject that was partially  matched.  See
2634       the pcre2partial documentation for details of partial matching.
2635
2636       After  a  fully  successful match, the first pair of offsets identifies
2637       the portion of the subject string that was matched by the  entire  pat‐
2638       tern.  The  next  pair is used for the first captured substring, and so
2639       on. The value returned by pcre2_match() is one more  than  the  highest
2640       numbered  pair  that  has been set. For example, if two substrings have
2641       been captured, the returned value is 3. If there are no  captured  sub‐
2642       strings, the return value from a successful match is 1, indicating that
2643       just the first pair of offsets has been set.
2644
2645       If a pattern uses the \K escape sequence within a  positive  assertion,
2646       the reported start of a successful match can be greater than the end of
2647       the match.  For example, if the pattern  (?=ab\K)  is  matched  against
2648       "ab", the start and end offset values for the match are 2 and 0.
2649
2650       If  a  capture group is matched repeatedly within a single match opera‐
2651       tion, it is the last portion of the subject that  it  matched  that  is
2652       returned.
2653
2654       If the ovector is too small to hold all the captured substring offsets,
2655       as much as possible is filled in, and the function returns a  value  of
2656       zero.  If captured substrings are not of interest, pcre2_match() may be
2657       called with a match data block whose ovector is of minimum length (that
2658       is, one pair).
2659
2660       It  is  possible for capture group number n+1 to match some part of the
2661       subject when group n has not been used at  all.  For  example,  if  the
2662       string "abc" is matched against the pattern (a|(z))(bc) the return from
2663       the function is 4, and groups 1 and 3 are matched, but 2 is  not.  When
2664       this  happens,  both values in the offset pairs corresponding to unused
2665       groups are set to PCRE2_UNSET.
2666
2667       Offset values that correspond to  unused  groups  at  the  end  of  the
2668       expression  are  also  set  to  PCRE2_UNSET. For example, if the string
2669       "abc" is matched against the pattern (abc)(x(yz)?)? groups 2 and 3  are
2670       not  matched.  The  return  from the function is 2, because the highest
2671       used capture group number is 1. The  offsets  for  the  second  and
2672       third  capture  groups (assuming the vector is large enough, of course)
2673       are set to PCRE2_UNSET.
2674
2675       Elements in the ovector that do not correspond to capturing parentheses
2676       in the pattern are never changed. That is, if a pattern contains n cap‐
2677       turing parentheses, no more than ovector[0] to ovector[2n+1] are set by
2678       pcre2_match().  The  other  elements retain whatever values they previ‐
2679       ously had. After a failed match attempt, the contents  of  the  ovector
2680       are unchanged.
2681

OTHER INFORMATION ABOUT A MATCH

2683
2684       PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
2685
2686       PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
2687
2688       As  well as the offsets in the ovector, other information about a match
2689       is retained in the match data block and can be retrieved by  the  above
2690       functions  in  appropriate  circumstances.  If they are called at other
2691       times, the result is undefined.
2692
2693       After a successful match, a partial match (PCRE2_ERROR_PARTIAL),  or  a
2694       failure  to  match (PCRE2_ERROR_NOMATCH), a mark name may be available.
2695       The function pcre2_get_mark() can be called to access this name,  which
2696       can  be  specified  in  the  pattern by any of the backtracking control
2697       verbs, not just (*MARK). The same function applies to all the verbs. It
2698       returns a pointer to the zero-terminated name, which is within the com‐
2699       piled pattern. If no name is available, NULL is returned. The length of
2700       the  name  (excluding  the terminating zero) is stored in the code unit
2701       that precedes the name. You should use this length instead  of  relying
2702       on the terminating zero if the name might contain a binary zero.
2703
2704       After  a  successful  match, the name that is returned is the last mark
2705       name encountered on the matching path through the pattern. Instances of
2706       backtracking  verbs  without  names do not count. Thus, for example, if
2707       the matching path contains (*MARK:A)(*PRUNE), the name "A" is returned.
2708       After  a  "no  match"  or a partial match, the last encountered name is
2709       returned. For example, consider this pattern:
2710
2711         ^(*MARK:A)((*MARK:B)a|b)c
2712
2713       When it matches "bc", the returned name is A. The B mark is  "seen"  in
2714       the  first  branch of the group, but it is not on the matching path. On
2715       the other hand, when this pattern fails to  match  "bx",  the  returned
2716       name is B.
2717
2718       Warning:  By  default, certain start-of-match optimizations are used to
2719       give a fast "no match" result in some situations. For example,  if  the
2720       anchoring  is removed from the pattern above, there is an initial check
2721       for the presence of "c" in the  subject  before  running  the  matching
2722       engine. This check fails for "bx", causing a match failure without see‐
2723       ing any marks. You can disable the start-of-match optimizations by set‐
2724       ting  the  PCRE2_NO_START_OPTIMIZE  option  for  pcre2_compile()  or by
2725       starting the pattern with (*NO_START_OPT).
2726
2727       After a successful match, a partial match, or one of  the  invalid  UTF
2728       errors  (for example, PCRE2_ERROR_UTF8_ERR5), pcre2_get_startchar() can
2729       be called. After a successful or partial match it returns the code unit
2730       offset  of  the character at which the match started. For a non-partial
2731       match, this can be different to the value of ovector[0] if the  pattern
2732       contains  the  \K escape sequence. After a partial match, however, this
2733       value is always the same as ovector[0] because \K does not  affect  the
2734       result of a partial match.
2735
2736       After  a UTF check failure, pcre2_get_startchar() can be used to obtain
2737       the code unit offset of the invalid UTF character. Details are given in
2738       the pcre2unicode page.
2739

ERROR RETURNS FROM pcre2_match()

2741
2742       If  pcre2_match() fails, it returns a negative number. This can be con‐
2743       verted to a text string by calling the pcre2_get_error_message()  func‐
2744       tion  (see  "Obtaining a textual error message" below).  Negative error
2745       codes are also returned by other functions,  and  are  documented  with
2746       them.  The codes are given names in the header file. If UTF checking is
2747       in force and an invalid UTF subject string is detected, one of a number
2748       of  UTF-specific negative error codes is returned. Details are given in
2749       the pcre2unicode page. The following are the other errors that  may  be
2750       returned by pcre2_match():
2751
2752         PCRE2_ERROR_NOMATCH
2753
2754       The subject string did not match the pattern.
2755
2756         PCRE2_ERROR_PARTIAL
2757
2758       The  subject  string did not match, but it did match partially. See the
2759       pcre2partial documentation for details of partial matching.
2760
2761         PCRE2_ERROR_BADMAGIC
2762
2763       PCRE2 stores a 4-byte "magic number" at the start of the compiled code,
2764       to  catch  the case when it is passed a junk pointer. This is the error
2765       that is returned when the magic number is not present.
2766
2767         PCRE2_ERROR_BADMODE
2768
2769       This error is given when a compiled pattern is passed to a function  in
2770       a  library  of a different code unit width, for example, a pattern com‐
2771       piled by the 8-bit library is passed to  a  16-bit  or  32-bit  library
2772       function.
2773
2774         PCRE2_ERROR_BADOFFSET
2775
2776       The value of startoffset was greater than the length of the subject.
2777
2778         PCRE2_ERROR_BADOPTION
2779
2780       An unrecognized bit was set in the options argument.
2781
2782         PCRE2_ERROR_BADUTFOFFSET
2783
2784       The UTF code unit sequence that was passed as a subject was checked and
2785       found to be valid (the PCRE2_NO_UTF_CHECK option was not set), but  the
2786       value  of startoffset did not point to the beginning of a UTF character
2787       or the end of the subject.
2788
2789         PCRE2_ERROR_CALLOUT
2790
2791       This error is never generated by pcre2_match() itself. It  is  provided
2792       for  use  by  callout  functions  that  want  to cause pcre2_match() or
2793       pcre2_callout_enumerate() to return a distinctive error code.  See  the
2794       pcre2callout documentation for details.
2795
2796         PCRE2_ERROR_DEPTHLIMIT
2797
2798       The nested backtracking depth limit was reached.
2799
2800         PCRE2_ERROR_HEAPLIMIT
2801
2802       The heap limit was reached.
2803
2804         PCRE2_ERROR_INTERNAL
2805
2806       An  unexpected  internal error has occurred. This error could be caused
2807       by a bug in PCRE2 or by overwriting of the compiled pattern.
2808
2809         PCRE2_ERROR_JIT_STACKLIMIT
2810
2811       This error is returned when a pattern that  was  successfully  compiled
2812       using  JIT  is being matched, but the memory available for the just-in-
2813       time processing stack is not large enough. See the pcre2jit  documenta‐
2814       tion for more details.
2815
2816         PCRE2_ERROR_MATCHLIMIT
2817
2818       The backtracking match limit was reached.
2819
2820         PCRE2_ERROR_NOMEMORY
2821
2822       If  a  pattern contains many nested backtracking points, heap memory is
2823       used to remember them. This error is given when the  memory  allocation
2824       function  (default  or  custom)  fails.  Note  that  a different error,
2825       PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed  exceeds
2826       the    heap   limit.   PCRE2_ERROR_NOMEMORY   is   also   returned   if
2827       PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
2828
2829         PCRE2_ERROR_NULL
2830
2831       Either the code, subject, or match_data argument was passed as NULL.
2832
2833         PCRE2_ERROR_RECURSELOOP
2834
2835       This error is returned when  pcre2_match()  detects  a  recursion  loop
2836       within  the  pattern. Specifically, it means that either the whole pat‐
2837       tern or a capture group has been called recursively for the second time
2838       at  the  same position in the subject string. Some simple patterns that
2839       might do this are detected and faulted at compile time, but  more  com‐
2840       plicated  cases,  in particular mutual recursions between two different
2841       groups, cannot be detected until matching is attempted.
2842

OBTAINING A TEXTUAL ERROR MESSAGE

2844
2845       int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
2846         PCRE2_SIZE bufflen);
2847
2848       A text message for an error code  from  any  PCRE2  function  (compile,
2849       match,  or  auxiliary)  can be obtained by calling pcre2_get_error_mes‐
2850       sage(). The code is passed as the first argument,  with  the  remaining
2851       two  arguments  specifying  a  code  unit buffer and its length in code
2852       units, into which the text message is placed. The message  is  returned
2853       in  code  units  of the appropriate width for the library that is being
2854       used.
2855
2856       The returned message is terminated with a trailing zero, and the  func‐
2857       tion  returns  the  number  of  code units used, excluding the trailing
2858       zero.  If  the  error  number  is  unknown,  the  negative  error  code
2859       PCRE2_ERROR_BADDATA  is  returned. If the buffer is too small, the mes‐
2860       sage is truncated (but still with a trailing zero),  and  the  negative
2861       error  code PCRE2_ERROR_NOMEMORY is returned.  None of the messages are
2862       very long; a buffer size of 120 code units is ample.
2863

EXTRACTING CAPTURED SUBSTRINGS BY NUMBER

2865
2866       int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
2867         uint32_t number, PCRE2_SIZE *length);
2868
2869       int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
2870         uint32_t number, PCRE2_UCHAR *buffer,
2871         PCRE2_SIZE *bufflen);
2872
2873       int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
2874         uint32_t number, PCRE2_UCHAR **bufferptr,
2875         PCRE2_SIZE *bufflen);
2876
2877       void pcre2_substring_free(PCRE2_UCHAR *buffer);
2878
2879       Captured substrings can be accessed directly by using  the  ovector  as
2880       described above.  For convenience, auxiliary functions are provided for
2881       extracting  captured  substrings  as  new,  separate,   zero-terminated
2882       strings. A substring that contains a binary zero is correctly extracted
2883       and has a further zero added on the end, but  the  result  is  not,  of
2884       course, a C string.
2885
2886       The functions in this section identify substrings by number. The number
2887       zero refers to the entire matched substring, with higher numbers refer‐
2888       ring  to  substrings  captured by parenthesized groups. After a partial
2889       match, only substring zero is available.  An  attempt  to  extract  any
2890       other  substring  gives the error PCRE2_ERROR_PARTIAL. The next section
2891       describes similar functions for extracting captured substrings by name.
2892
2893       If a pattern uses the \K escape sequence within a  positive  assertion,
2894       the reported start of a successful match can be greater than the end of
2895       the match.  For example, if the pattern  (?=ab\K)  is  matched  against
2896       "ab",  the  start  and  end offset values for the match are 2 and 0. In
2897       this situation, calling these functions with a  zero  substring  number
2898       extracts a zero-length empty string.
2899
2900       You  can  find the length in code units of a captured substring without
2901       extracting it by calling pcre2_substring_length_bynumber().  The  first
2902       argument  is a pointer to the match data block, the second is the group
2903       number, and the third is a pointer to a variable into which the  length
2904       is  placed.  If  you just want to know whether or not the substring has
2905       been captured, you can pass the third argument as NULL.
2906
2907       The pcre2_substring_copy_bynumber() function  copies  a  captured  sub‐
2908       string  into  a supplied buffer, whereas pcre2_substring_get_bynumber()
2909       copies it into new memory, obtained using the  same  memory  allocation
2910       function  that  was  used for the match data block. The first two argu‐
2911       ments of these functions are a pointer to the match data  block  and  a
2912       capture group number.
2913
2914       The final arguments of pcre2_substring_copy_bynumber() are a pointer to
2915       the buffer and a pointer to a variable that contains its length in code
2916       units.  This is updated to contain the actual number of code units used
2917       for the extracted substring, excluding the terminating zero.
2918
2919       For pcre2_substring_get_bynumber() the third and fourth arguments point
2920       to  variables that are updated with a pointer to the new memory and the
2921       number of code units that comprise the substring, again  excluding  the
2922       terminating  zero.  When  the substring is no longer needed, the memory
2923       should be freed by calling pcre2_substring_free().
2924
2925       The return value from all these functions is zero  for  success,  or  a
2926       negative  error  code.  If  the pattern match failed, the match failure
2927       code is returned.  If a substring number  greater  than  zero  is  used
2928       after  a partial match, PCRE2_ERROR_PARTIAL is returned. Other possible
2929       error codes are:
2930
2931         PCRE2_ERROR_NOMEMORY
2932
2933       The buffer was too small for  pcre2_substring_copy_bynumber(),  or  the
2934       attempt to get memory failed for pcre2_substring_get_bynumber().
2935
2936         PCRE2_ERROR_NOSUBSTRING
2937
2938       There  is  no  substring  with that number in the pattern, that is, the
2939       number is greater than the number of capturing parentheses.
2940
2941         PCRE2_ERROR_UNAVAILABLE
2942
2943       The substring number, though not greater than the number of captures in
2944       the pattern, is greater than the number of slots in the ovector, so the
2945       substring could not be captured.
2946
2947         PCRE2_ERROR_UNSET
2948
2949       The substring did not participate in the match.  For  example,  if  the
2950       pattern  is  (abc)|(def) and the subject is "def", and the ovector con‐
2951       tains at least two capturing slots, substring number 1 is unset.
2952

EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS

2954
2955       int pcre2_substring_list_get(pcre2_match_data *match_data,
2956         PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
2957
2958       void pcre2_substring_list_free(PCRE2_SPTR *list);
2959
2960       The pcre2_substring_list_get() function  extracts  all  available  sub‐
2961       strings  and  builds  a  list of pointers to them. It also (optionally)
2962       builds a second list that  contains  their  lengths  (in  code  units),
2963       excluding a terminating zero that is added to each of them. All this is
2964       done in a single block of memory that is obtained using the same memory
2965       allocation function that was used to get the match data block.
2966
2967       This  function  must be called only after a successful match. If called
2968       after a partial match, the error code PCRE2_ERROR_PARTIAL is returned.
2969
2970       The address of the memory block is returned via listptr, which is  also
2971       the start of the list of string pointers. The end of the list is marked
2972       by a NULL pointer. The address of the list of lengths is  returned  via
2973       lengthsptr.  If your strings do not contain binary zeros and you do not
2974       therefore need the lengths, you may supply NULL as the lengthsptr argu‐
2975       ment  to  disable  the  creation of a list of lengths. The yield of the
2976       function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the  mem‐
2977       ory  block could not be obtained. When the list is no longer needed, it
2978       should be freed by calling pcre2_substring_list_free().
2979
2980       If this function encounters a substring that is unset, which can happen
2981       when  capture  group  number  n+1 matches some part of the subject, but
2982       group n has not been used at all, it returns an empty string. This  can
2983       be distinguished from a genuine zero-length substring by inspecting the
2984       appropriate offset in the ovector, which contains PCRE2_UNSET for unset
2985       substrings, or by calling pcre2_substring_length_bynumber().
2986

EXTRACTING CAPTURED SUBSTRINGS BY NAME

2988
2989       int pcre2_substring_number_from_name(const pcre2_code *code,
2990         PCRE2_SPTR name);
2991
2992       int pcre2_substring_length_byname(pcre2_match_data *match_data,
2993         PCRE2_SPTR name, PCRE2_SIZE *length);
2994
2995       int pcre2_substring_copy_byname(pcre2_match_data *match_data,
2996         PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
2997
2998       int pcre2_substring_get_byname(pcre2_match_data *match_data,
2999         PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
3000
3001       void pcre2_substring_free(PCRE2_UCHAR *buffer);
3002
3003       To  extract  a substring by name, you first have to find the associated
3004       number. For example, for this pattern:
3005
3006         (a+)b(?<xxx>\d+)...
3007
3008       the number of the capture group called "xxx" is 2. If the name is known
3009       to be unique (PCRE2_DUPNAMES was not set), you can find the number from
3010       the name by calling pcre2_substring_number_from_name(). The first argu‐
3011       ment  is the compiled pattern, and the second is the name. The yield of
3012       the function is the group number, PCRE2_ERROR_NOSUBSTRING if  there  is
3013       no  group  with that name, or PCRE2_ERROR_NOUNIQUESUBSTRING if there is
3014       more than one group with that name.  Given the number, you can  extract
3015       the  substring  directly from the ovector, or use one of the "bynumber"
3016       functions described above.
3017
3018       For convenience, there are also "byname" functions that  correspond  to
3019       the  "bynumber"  functions,  the  only difference being that the second
3020       argument is a name instead of a number. If PCRE2_DUPNAMES  is  set  and
3021       there are duplicate names, these functions scan all the groups with the
3022       given name, and return the captured  substring  from  the  first  named
3023       group that is set.
3024
3025       If  there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is
3026       returned. If all groups with the name have  numbers  that  are  greater
3027       than  the  number  of  slots in the ovector, PCRE2_ERROR_UNAVAILABLE is
3028       returned. If there is at least one group with a slot  in  the  ovector,
3029       but no group is found to be set, PCRE2_ERROR_UNSET is returned.
3030
3031       Warning: If the pattern uses the (?| feature to set up multiple capture
3032       groups with the same number, as described in the section  on  duplicate
3033       group numbers in the pcre2pattern page, you cannot use names to distin‐
3034       guish the different capture groups, because names are not  included  in
3035       the  compiled  code.  The  matching process uses only numbers. For this
3036       reason, the use of different names for  groups  with  the  same  number
3037       causes an error at compile time.
3038

CREATING A NEW STRING WITH SUBSTITUTIONS

3040
3041       int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
3042         PCRE2_SIZE length, PCRE2_SIZE startoffset,
3043         uint32_t options, pcre2_match_data *match_data,
3044         pcre2_match_context *mcontext, PCRE2_SPTR replacement,
3045         PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
3046         PCRE2_SIZE *outlengthptr);
3047
3048       This  function  optionally calls pcre2_match() and then makes a copy of
3049       the subject string in outputbuffer, replacing parts that  were  matched
3050       with  the replacement string, whose length is supplied in rlength. This
3051       can be given as PCRE2_ZERO_TERMINATED  for  a  zero-terminated  string.
3052       There  is  an  option  (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to
3053       return just the replacement string(s). The default action is to perform
3054       just  one  replacement  if  the pattern matches, but there is an option
3055       that  requests  multiple  replacements   (see   PCRE2_SUBSTITUTE_GLOBAL
3056       below).
3057
3058       If  successful,  pcre2_substitute() returns the number of substitutions
3059       that were carried out. This may be zero if no match was found,  and  is
3060       never  greater  than one unless PCRE2_SUBSTITUTE_GLOBAL is set. A nega‐
3061       tive value is returned if an error is detected.
3062
3063       Matches in which a \K item in a lookahead in  the  pattern  causes  the
3064       match  to  end  before it starts are not supported, and give rise to an
3065       error return. For global replacements, matches in which \K in a lookbe‐
3066       hind  causes the match to start earlier than the point that was reached
3067       in the previous iteration are also not supported.
3068
3069       The first seven arguments of pcre2_substitute() are  the  same  as  for
3070       pcre2_match(), except that the partial matching options are not permit‐
3071       ted, and match_data may be passed as NULL, in which case a  match  data
3072       block  is obtained and freed within this function, using memory manage‐
3073       ment functions from the match context, if provided, or else those  that
3074       were used to allocate memory for the compiled code.
3075
3076       If  match_data is not NULL and PCRE2_SUBSTITUTE_MATCHED is not set, the
3077       provided block is used for all calls to pcre2_match(), and its contents
3078       afterwards  are  the result of the final call. For global changes, this
3079       will always be a no-match error. The contents of the ovector within the
3080       match data block may or may not have been changed.
3081
3082       As  well as the usual options for pcre2_match(), a number of additional
3083       options can be set in the options argument of pcre2_substitute().   One
3084       such  option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
3085       match_data block must be provided, and it must have been  used  for  an
3086       external  call  to  pcre2_match().  The  data  in  the match_data block
3087       (return code, offset vector) is used for the first substitution instead
3088       of calling pcre2_match() from within pcre2_substitute(). This allows an
3089       application to check for a match before choosing to substitute, without
3090       having to repeat the match.
3091
3092       The  contents  of  the  externally  supplied  match  data block are not
3093       changed  when  PCRE2_SUBSTITUTE_MATCHED  is   set.   If   PCRE2_SUBSTI‐
3094       TUTE_GLOBAL  is  also set, pcre2_match() is called after the first sub‐
3095       stitution to check for further matches,  but  this  is  done  using  an
3096       internally  obtained match data block, thus always leaving the external
3097       block unchanged.
3098
3099       The code argument is not used for matching before the  first  substitu‐
3100       tion  when  PCRE2_SUBSTITUTE_MATCHED  is  set, but it must be provided,
3101       even when PCRE2_SUBSTITUTE_GLOBAL  is  not  set,  because  it  contains
3102       information  such as the UTF setting and the number of capturing paren‐
3103       theses in the pattern.
3104
3105       The default action of pcre2_substitute() is to return  a  copy  of  the
3106       subject string with matched substrings replaced. However, if PCRE2_SUB‐
3107       STITUTE_REPLACEMENT_ONLY is set, only the  replacement  substrings  are
3108       returned. In the global case, multiple replacements are concatenated in
3109       the output buffer. Substitution callouts (see below)  can  be  used  to
3110       separate them if necessary.
3111
3112       The  outlengthptr  argument of pcre2_substitute() must point to a vari‐
3113       able that contains the length, in code units, of the output buffer.  If
3114       the  function is successful, the value is updated to contain the length
3115       in code units of the new string, excluding the trailing  zero  that  is
3116       automatically added.
3117
3118       If  the  function  is  not  successful,  the value set via outlengthptr
3119       depends on the type of error. For  syntax  errors  in  the  replacement
3120       string,  the  value  is  the offset in the replacement string where the
3121       error was detected. For other  errors,  the  value  is  PCRE2_UNSET  by
3122       default.  This  includes the case of the output buffer being too small,
3123       unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set.
3124
3125       PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when  the  output
3126       buffer is too small. The default action is to return PCRE2_ERROR_NOMEM‐
3127       ORY immediately. If this option  is  set,  however,  pcre2_substitute()
3128       continues to go through the motions of matching and substituting (with‐
3129       out, of course, writing anything) in order to compute the size of  buf‐
3130       fer  that  is  needed.  This  value is passed back via the outlengthptr
3131       variable,   with   the   result   of   the   function    still    being
3132       PCRE2_ERROR_NOMEMORY.
3133
3134       Passing  a  buffer  size  of zero is a permitted way of finding out how
3135       much memory is needed for a given substitution. However, this does mean
3136       that the entire operation is carried out twice. Depending on the appli‐
3137       cation, it may be more efficient to allocate a large  buffer  and  free
3138       the   excess   afterwards,   instead  of  using  PCRE2_SUBSTITUTE_OVER‐
3139       FLOW_LENGTH.
3140
3141       The replacement string, which is interpreted as a  UTF  string  in  UTF
3142       mode,  is checked for UTF validity unless PCRE2_NO_UTF_CHECK is set. An
3143       invalid UTF replacement string causes an immediate return with the rel‐
3144       evant UTF error code.
3145
3146       If  PCRE2_SUBSTITUTE_LITERAL  is  set,  the  replacement  string is not
3147       interpreted in any way. By default, however, a dollar character  is  an
3148       escape character that can specify the insertion of characters from cap‐
3149       ture groups and names from (*MARK) or other control verbs in  the  pat‐
3150       tern. The following forms are always recognized:
3151
3152         $$                  insert a dollar character
3153         $<n> or ${<n>}      insert the contents of group <n>
3154         $*MARK or ${*MARK}  insert a control verb name
3155
3156       Either  a  group  number  or  a  group name can be given for <n>. Curly
3157       brackets are required only if the following character would  be  inter‐
3158       preted as part of the number or name. The number may be zero to include
3159       the entire matched string.   For  example,  if  the  pattern  a(b)c  is
3160       matched  with "=abc=" and the replacement string "+$1$0$1+", the result
3161       is "=+babcb+=".
3162
3163       $*MARK inserts the name from the last encountered backtracking  control
3164       verb  on the matching path that has a name. (*MARK) must always include
3165       a name, but the other verbs need not.  For  example,  in  the  case  of
3166       (*MARK:A)(*PRUNE) the name inserted is "A", but for (*MARK:A)(*PRUNE:B)
3167       the relevant name is "B". This facility can be used to  perform  simple
3168       simultaneous substitutions, as this pcre2test example shows:
3169
3170         /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
3171             apple lemon
3172          2: pear orange
3173
3174       PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject
3175       string, replacing every matching substring. If this option is not  set,
3176       only  the  first matching substring is replaced. The search for matches
3177       takes place in the original subject string (that is, previous  replace‐
3178       ments  do  not  affect  it).  Iteration is implemented by advancing the
3179       startoffset value for each search, which is always  passed  the  entire
3180       subject string. If an offset limit is set in the match context, search‐
3181       ing stops when that limit is reached.
3182
3183       You can restrict the effect of a global substitution to  a  portion  of
3184       the subject string by setting either or both of startoffset and an off‐
3185       set limit. Here is a pcre2test example:
3186
3187         /B/g,replace=!,use_offset_limit
3188         ABC ABC ABC ABC\=offset=3,offset_limit=12
3189          2: ABC A!C A!C ABC
3190
3191       When continuing with global substitutions after  matching  a  substring
3192       with zero length, an attempt to find a non-empty match at the same off‐
3193       set is performed.  If this is not successful, the offset is advanced by
3194       one character except when CRLF is a valid newline sequence and the next
3195       two characters are CR, LF. In this case, the offset is advanced by  two
3196       characters.
3197
3198       PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that
3199       do not appear in the pattern to be treated as unset groups. This option
3200       should  be used with care, because it means that a typo in a group name
3201       or number no longer causes the PCRE2_ERROR_NOSUBSTRING error.
3202
3203       PCRE2_SUBSTITUTE_UNSET_EMPTY causes  unset  capture  groups  (including
3204       unknown  groups  when  PCRE2_SUBSTITUTE_UNKNOWN_UNSET  is  set)  to  be
3205       treated as empty strings when inserted  as  described  above.  If  this
3206       option  is  not  set,  an  attempt  to insert an unset group causes the
3207       PCRE2_ERROR_UNSET error. This option does not  influence  the  extended
3208       substitution syntax described below.
3209
3210       PCRE2_SUBSTITUTE_EXTENDED  causes extra processing to be applied to the
3211       replacement string. Without this option, only the dollar  character  is
3212       special,  and  only  the  group insertion forms listed above are valid.
3213       When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
3214
3215       Firstly, backslash in a replacement string is interpreted as an  escape
3216       character. The usual forms such as \n or \x{ddd} can be used to specify
3217       particular character codes, and backslash followed by any  non-alphanu‐
3218       meric  character  quotes  that character. Extended quoting can be coded
3219       using \Q...\E, exactly as in pattern strings.
3220
3221       There are also four escape sequences for forcing the case  of  inserted
3222       letters.   The  insertion  mechanism has three states: no case forcing,
3223       force upper case, and force lower case. The escape sequences change the
3224       current state: \U and \L change to upper or lower case forcing, respec‐
3225       tively, and \E (when not terminating a \Q quoted sequence)  reverts  to
3226       no  case  forcing. The sequences \u and \l force the next character (if
3227       it is a letter) to upper or lower  case,  respectively,  and  then  the
3228       state automatically reverts to no case forcing. Case forcing applies to
3229       all inserted  characters, including those from capture groups and  let‐
3230       ters  within \Q...\E quoted sequences. If either PCRE2_UTF or PCRE2_UCP
3231       was set when the pattern was compiled, Unicode properties are used  for
3232       case forcing characters whose code points are greater than 127.
3233
3234       Note that case forcing sequences such as \U...\E do not nest. For exam‐
3235       ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc";  the  final
3236       \E   has   no   effect.   Note   also   that   the  PCRE2_ALT_BSUX  and
3237       PCRE2_EXTRA_ALT_BSUX options do not apply to replacement strings.
3238
3239       The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to  add  more
3240       flexibility  to  capture  group  substitution. The syntax is similar to
3241       that used by Bash:
3242
3243         ${<n>:-<string>}
3244         ${<n>:+<string1>:<string2>}
3245
3246       As before, <n> may be a group number or a name. The first  form  speci‐
3247       fies  a  default  value. If group <n> is set, its value is inserted; if
3248       not, <string> is expanded and the  result  inserted.  The  second  form
3249       specifies  strings that are expanded and inserted when group <n> is set
3250       or unset, respectively. The first form is just a  convenient  shorthand
3251       for
3252
3253         ${<n>:+${<n>}:<string>}
3254
3255       Backslash  can  be  used to escape colons and closing curly brackets in
3256       the replacement strings. A change of the case forcing  state  within  a
3257       replacement  string  remains  in  force  afterwards,  as  shown in this
3258       pcre2test example:
3259
3260         /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
3261             body
3262          1: hello
3263             somebody
3264          1: HELLO
3265
3266       The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these  extended
3267       substitutions.   However,   PCRE2_SUBSTITUTE_UNKNOWN_UNSET  does  cause
3268       unknown groups in the extended syntax forms to be treated as unset.
3269
3270       If  PCRE2_SUBSTITUTE_LITERAL  is  set,  PCRE2_SUBSTITUTE_UNKNOWN_UNSET,
3271       PCRE2_SUBSTITUTE_UNSET_EMPTY, and PCRE2_SUBSTITUTE_EXTENDED are irrele‐
3272       vant and are ignored.
3273
3274   Substitution errors
3275
3276       In the event of an error, pcre2_substitute() returns a  negative  error
3277       code.  Except for PCRE2_ERROR_NOMATCH (which is never returned), errors
3278       from pcre2_match() are passed straight back.
3279
3280       PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring inser‐
3281       tion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
3282
3283       PCRE2_ERROR_UNSET is returned for an unset substring insertion (includ‐
3284       ing an unknown substring when  PCRE2_SUBSTITUTE_UNKNOWN_UNSET  is  set)
3285       when  the  simple  (non-extended)  syntax  is  used  and  PCRE2_SUBSTI‐
3286       TUTE_UNSET_EMPTY is not set.
3287
3288       PCRE2_ERROR_NOMEMORY is returned  if  the  output  buffer  is  not  big
3289       enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size
3290       of buffer that is needed is returned via outlengthptr. Note  that  this
3291       does not happen by default.
3292
3293       PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
3294       match_data argument is NULL.
3295
3296       PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax  errors  in
3297       the   replacement   string,   with   more   particular   errors   being
3298       PCRE2_ERROR_BADREPESCAPE (invalid  escape  sequence),  PCRE2_ERROR_REP‐
3299       MISSINGBRACE  (closing curly bracket not found), PCRE2_ERROR_BADSUBSTI‐
3300       TUTION   (syntax   error   in   extended   group   substitution),   and
3301       PCRE2_ERROR_BADSUBSPATTERN  (the  pattern match ended before it started
3302       or the match started earlier than the current position in the  subject,
3303       which can happen if \K is used in an assertion).
3304
3305       As for all PCRE2 errors, a text message that describes the error can be
3306       obtained  by  calling  the  pcre2_get_error_message()   function   (see
3307       "Obtaining a textual error message" above).
3308
3309   Substitution callouts
3310
3311       int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
3312         int (*callout_function)(pcre2_substitute_callout_block *, void *),
3313         void *callout_data);
3314
3315       The  pcre2_set_substitute_callout() function can be used to  specify  a
3316       callout function for pcre2_substitute(). This information is passed  in
3317       a match context. The callout function is called after each substitution
3318       has been processed, but it can cause the replacement not to happen. The
3319       callout  function is not called for simulated substitutions that happen
3320       as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option.
3321
3322       The first argument of the callout function is a pointer to a substitute
3323       callout  block structure, which contains the following fields, not nec‐
3324       essarily in this order:
3325
3326         uint32_t    version;
3327         uint32_t    subscount;
3328         PCRE2_SPTR  input;
3329         PCRE2_SPTR  output;
3330         PCRE2_SIZE *ovector;
3331         uint32_t    oveccount;
3332         PCRE2_SIZE  output_offsets[2];
3333
3334       The version field contains the version number of the block format.  The
3335       current  version  is  0.  The version number will increase in future if
3336       more fields are added, but the intention is never to remove any of  the
3337       existing fields.
3338
3339       The subscount field is the number of the current match. It is 1 for the
3340       first callout, 2 for the second, and so on. The input and output point‐
3341       ers are copies of the values passed to pcre2_substitute().
3342
3343       The  ovector  field points to the ovector, which contains the result of
3344       the most recent match. The oveccount field contains the number of pairs
3345       that are set in the ovector, and is always greater than zero.
3346
3347       The  output_offsets  vector  contains the offsets of the replacement in
3348       the output string. This has already been processed for dollar  and  (if
3349       requested) backslash substitutions as described above.
3350
3351       The  second  argument  of  the  callout function is the value passed as
3352       callout_data when the function was registered. The  value  returned  by
3353       the callout function is interpreted as follows:
3354
3355       If  the  value is zero, the replacement is accepted, and, if PCRE2_SUB‐
3356       STITUTE_GLOBAL is set, processing continues with a search for the  next
3357       match.  If  the  value  is  not  zero,  the  current replacement is not
3358       accepted. If the value is greater than zero, processing continues  when
3359       PCRE2_SUBSTITUTE_GLOBAL  is set. Otherwise (the value is less than zero
3360       or  PCRE2_SUBSTITUTE_GLOBAL  is  not set), the rest of  the  input  is
3361       copied  to the output and the call to pcre2_substitute() exits, return‐
3362       ing the number of matches so far.
3363

DUPLICATE CAPTURE GROUP NAMES

3365
3366       int pcre2_substring_nametable_scan(const pcre2_code *code,
3367         PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
3368
3369       When a pattern is compiled with the PCRE2_DUPNAMES  option,  names  for
3370       capture  groups  are  not  required  to  be unique. Duplicate names are
3371       always allowed for groups with the same number, created  by  using  the
3372       (?| feature. Indeed, if such groups are named, they are required to use
3373       the same names.
3374
3375       Normally, patterns that use duplicate names are such that  in  any  one
3376       match,  only  one of each set of identically-named groups participates.
3377       An example is shown in the pcre2pattern documentation.
3378
3379       When  duplicates   are   present,   pcre2_substring_copy_byname()   and
3380       pcre2_substring_get_byname()  return  the first substring corresponding
3381       to  the  given  name  that  is  set.  Only   if   none   are   set   is
3382       PCRE2_ERROR_UNSET   returned.  The  pcre2_substring_number_from_name()
3383       function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
3384       duplicate names.
3385
3386       If  you want to get full details of all captured substrings for a given
3387       name, you must use the pcre2_substring_nametable_scan()  function.  The
3388       first  argument is the compiled pattern, and the second is the name. If
3389       the third and fourth arguments are NULL, the function returns  a  group
3390       number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
3391
3392       When the third and fourth arguments are not NULL, they must be pointers
3393       to variables that are updated by the function. After it has  run,  they
3394       point to the first and last entries in the name-to-number table for the
3395       given name, and the function returns the length of each entry  in  code
3396       units.  In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
3397       no entries for the given name.
3398
3399       The format of the name table is described above in the section entitled
3400       Information  about  a  pattern.  Given all the relevant entries for the
3401       name, you can extract each of their numbers,  and  hence  the  captured
3402       data.
3403

FINDING ALL POSSIBLE MATCHES AT ONE POSITION

3405
3406       The  traditional  matching  function  uses a similar algorithm to Perl,
3407       which stops when it finds the first match at a given point in the  sub‐
3408       ject. If you want to find all possible matches, or the longest possible
3409       match at a given position,  consider  using  the  alternative  matching
3410       function  (see  below) instead. If you cannot use the alternative func‐
3411       tion, you can kludge it up by making use of the callout facility, which
3412       is described in the pcre2callout documentation.
3413
3414       What you have to do is to insert a callout right at the end of the pat‐
3415       tern.  When your callout function is called, extract and save the  cur‐
3416       rent  matched  substring.  Then return 1, which forces pcre2_match() to
3417       backtrack and try other alternatives. Ultimately, when it runs  out  of
3418       matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
3419

MATCHING A PATTERN: THE ALTERNATIVE FUNCTION

3421
3422       int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
3423         PCRE2_SIZE length, PCRE2_SIZE startoffset,
3424         uint32_t options, pcre2_match_data *match_data,
3425         pcre2_match_context *mcontext,
3426         int *workspace, PCRE2_SIZE wscount);
3427
3428       The  function  pcre2_dfa_match()  is  called  to match a subject string
3429       against a compiled pattern, using a matching algorithm that  scans  the
3430       subject string just once (not counting lookaround assertions), and does
3431       not backtrack.  This has different characteristics to the normal  algo‐
3432       rithm,  and  is not compatible with Perl. Some of the features of PCRE2
3433       patterns are not supported.  Nevertheless, there are  times  when  this
3434       kind  of  matching  can be useful. For a discussion of the two matching
3435       algorithms, and a list of features that pcre2_dfa_match() does not sup‐
3436       port, see the pcre2matching documentation.
3437
3438       The  arguments  for  the pcre2_dfa_match() function are the same as for
3439       pcre2_match(), plus two extras. The ovector within the match data block
3440       is used in a different way, and this is described below. The other com‐
3441       mon arguments are used in the same way as for pcre2_match(),  so  their
3442       description is not repeated here.
3443
3444       The  two  additional  arguments provide workspace for the function. The
3445       workspace vector should contain at least 20 elements. It  is  used  for
3446       keeping  track  of  multiple  paths  through  the  pattern  tree.  More
3447       workspace is needed for patterns and subjects where there are a lot  of
3448       potential matches.
3449
3450       Here is an example of a simple call to pcre2_dfa_match():
3451
3452         int wspace[20];
3453         pcre2_match_data *md = pcre2_match_data_create(4, NULL);
3454         int rc = pcre2_dfa_match(
3455           re,             /* result of pcre2_compile() */
3456           "some string",  /* the subject string */
3457           11,             /* the length of the subject string */
3458           0,              /* start at offset 0 in the subject */
3459           0,              /* default options */
3460           md,             /* the match data block */
3461           NULL,           /* a match context; NULL means use defaults */
3462           wspace,         /* working space vector */
3463           20);            /* number of elements (NOT size in bytes) */
3464
3465   Option bits for pcre2_dfa_match()
3466
3467       The  unused  bits of the options argument for pcre2_dfa_match() must be
3468       zero.  The  only   bits   that   may   be   set   are   PCRE2_ANCHORED,
3469       PCRE2_COPY_MATCHED_SUBJECT,       PCRE2_ENDANCHORED,      PCRE2_NOTBOL,
3470       PCRE2_NOTEOL,          PCRE2_NOTEMPTY,          PCRE2_NOTEMPTY_ATSTART,
3471       PCRE2_NO_UTF_CHECK,       PCRE2_PARTIAL_HARD,       PCRE2_PARTIAL_SOFT,
3472       PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but  the  last  four  of
3473       these  are  exactly the same as for pcre2_match(), so their description
3474       is not repeated here.
3475
3476         PCRE2_PARTIAL_HARD
3477         PCRE2_PARTIAL_SOFT
3478
3479       These have the same general effect as they do  for  pcre2_match(),  but
3480       the  details are slightly different. When PCRE2_PARTIAL_HARD is set for
3481       pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if  the  end  of  the
3482       subject is reached and there is still at least one matching possibility
3483       that requires additional characters. This happens even if some complete
3484       matches  have  already  been found. When PCRE2_PARTIAL_SOFT is set, the
3485       return code PCRE2_ERROR_NOMATCH is converted  into  PCRE2_ERROR_PARTIAL
3486       if  the  end  of  the  subject  is reached, there have been no complete
3487       matches, but there is still at least one matching possibility. The por‐
3488       tion  of  the  string that was inspected when the longest partial match
3489       was found is set as the first matching string in both cases. There is a
3490       more  detailed  discussion  of partial and multi-segment matching, with
3491       examples, in the pcre2partial documentation.
3492
3493         PCRE2_DFA_SHORTEST
3494
3495       Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm  to
3496       stop as soon as it has found one match. Because of the way the alterna‐
3497       tive algorithm works, this is necessarily the shortest  possible  match
3498       at the first possible matching point in the subject string.
3499
3500         PCRE2_DFA_RESTART
3501
3502       When  pcre2_dfa_match() returns a partial match, it is possible to call
3503       it again, with additional subject characters, and have it continue with
3504       the same match. The PCRE2_DFA_RESTART option requests this action; when
3505       it is set, the workspace and wscount arguments must reference the  same
3506       vector  as  before  because data about the match so far is left in them
3507       after a partial match. There is more discussion of this facility in the
3508       pcre2partial documentation.
3509
3510   Successful returns from pcre2_dfa_match()
3511
3512       When pcre2_dfa_match() succeeds, it may have matched more than one sub‐
3513       string in the subject. Note, however, that all the matches from one run
3514       of  the  function  start  at the same point in the subject. The shorter
3515       matches are all initial substrings of the longer matches. For  example,
3516       if the pattern
3517
3518         <.*>
3519
3520       is matched against the string
3521
3522         This is <something> <something else> <something further> no more
3523
3524       the three matched strings are
3525
3526         <something> <something else> <something further>
3527         <something> <something else>
3528         <something>
3529
3530       On  success,  the  yield of the function is a number greater than zero,
3531       which is the number of matched substrings.  The  offsets  of  the  sub‐
3532       strings  are returned in the ovector, and can be extracted by number in
3533       the same way as for pcre2_match(), but the numbers bear no relation  to
3534       any  capture groups that may exist in the pattern, because DFA matching
3535       does not support capturing.
3536
3537       Calls to the convenience functions  that  extract  substrings  by  name
3538       return  the  error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used
3539       after a DFA match. The convenience functions that extract substrings by
3540       number never return PCRE2_ERROR_NOSUBSTRING.
3541
3542       The  matched  strings  are  stored  in  the ovector in reverse order of
3543       length; that is, the longest matching string is first.  If  there  were
3544       too  many matches to fit into the ovector, the yield of the function is
3545       zero, and the vector is filled with the longest matches.
3546
3547       NOTE: PCRE2's "auto-possessification" optimization usually  applies  to
3548       character  repeats at the end of a pattern (as well as internally). For
3549       example, the pattern "a\d+" is compiled as if it were "a\d++". For  DFA
3550       matching,  this  means  that  only  one possible match is found. If you
3551       really do want multiple matches in such cases, either use  an  ungreedy
3552       repeat  such  as  "a\d+?"  or set the PCRE2_NO_AUTO_POSSESS option when
3553       compiling.
3554
3555   Error returns from pcre2_dfa_match()
3556
3557       The pcre2_dfa_match() function returns a negative number when it fails.
3558       Many  of  the  errors  are  the same as for pcre2_match(), as described
3559       above.  There are in addition the following errors that are specific to
3560       pcre2_dfa_match():
3561
3562         PCRE2_ERROR_DFA_UITEM
3563
3564       This  return  is  given  if pcre2_dfa_match() encounters an item in the
3565       pattern that it does not support, for instance, the use of \C in a  UTF
3566       mode or a backreference.
3567
3568         PCRE2_ERROR_DFA_UCOND
3569
3570       This  return  is given if pcre2_dfa_match() encounters a condition item
3571       that uses a backreference for the condition, or a test for recursion in
3572       a specific capture group. These are not supported.
3573
3574         PCRE2_ERROR_DFA_UINVALID_UTF
3575
3576       This  return is given if pcre2_dfa_match() is called for a pattern that
3577       was compiled with PCRE2_MATCH_INVALID_UTF. This is  not  supported  for
3578       DFA matching.
3579
3580         PCRE2_ERROR_DFA_WSSIZE
3581
3582       This  return  is  given  if  pcre2_dfa_match() runs out of space in the
3583       workspace vector.
3584
3585         PCRE2_ERROR_DFA_RECURSE
3586
3587       When a recursion or subroutine call is processed, the matching function
3588       calls  itself  recursively,  using  private  memory for the ovector and
3589       workspace.  This error is given if the internal ovector  is  not  large
3590       enough.  This  should  be  extremely  rare, as a vector of size 1000 is
3591       used.
3592
3593         PCRE2_ERROR_DFA_BADRESTART
3594
3595       When pcre2_dfa_match() is called  with  the  PCRE2_DFA_RESTART  option,
3596       some  plausibility  checks  are  made on the contents of the workspace,
3597       which should contain data about the previous partial match. If  any  of
3598       these checks fail, this error is given.
3599

SEE ALSO

3601
3602       pcre2build(3),    pcre2callout(3),    pcre2demo(3),   pcre2matching(3),
3603       pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3).
3604

AUTHOR

3606
3607       Philip Hazel
3608       University Computing Service
3609       Cambridge, England.
3610

REVISION

3612
3613       Last updated: 04 November 2020
3614       Copyright (c) 1997-2020 University of Cambridge.
3615
3616
3617
3618PCRE2 10.36                    04 November 2020                    PCRE2API(3)
Impressum