File Coverage

deps/libgit2/deps/pcre/pcre_compile.c
Criterion Covered Total %
statement 585 2530 23.1
branch 305 2214 13.7
condition n/a
subroutine n/a
pod n/a
total 890 4744 18.7


line stmt bran cond sub pod time code
1             /*************************************************
2             * Perl-Compatible Regular Expressions *
3             *************************************************/
4              
5             /* PCRE is a library of functions to support regular expressions whose syntax
6             and semantics are as close as possible to those of the Perl 5 language.
7              
8             Written by Philip Hazel
9             Copyright (c) 1997-2020 University of Cambridge
10              
11             -----------------------------------------------------------------------------
12             Redistribution and use in source and binary forms, with or without
13             modification, are permitted provided that the following conditions are met:
14              
15             * Redistributions of source code must retain the above copyright notice,
16             this list of conditions and the following disclaimer.
17              
18             * Redistributions in binary form must reproduce the above copyright
19             notice, this list of conditions and the following disclaimer in the
20             documentation and/or other materials provided with the distribution.
21              
22             * Neither the name of the University of Cambridge nor the names of its
23             contributors may be used to endorse or promote products derived from
24             this software without specific prior written permission.
25              
26             THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27             AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28             IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29             ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30             LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31             CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33             INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34             CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35             ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36             POSSIBILITY OF SUCH DAMAGE.
37             -----------------------------------------------------------------------------
38             */
39              
40              
41             /* This module contains the external function pcre_compile(), along with
42             supporting internal functions that are not used by other modules. */
43              
44              
45             #ifdef HAVE_CONFIG_H
46             #include "config.h"
47             #endif
48              
/* These three names are defined ahead of the #include of pcre_internal.h that
follows. NOTE(review): presumably macros in that header expand NLBLOCK, PSSTART
and PSEND - confirm there, since the header is not visible from here. */
49             #define NLBLOCK cd /* Block containing newline information */
50             #define PSSTART start_pattern /* Field containing pattern start */
51             #define PSEND end_pattern /* Field containing pattern end */
52              
53             #include "pcre_internal.h"
54              
55              
56             /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57             is also used by pcretest. PCRE_DEBUG is not defined when building a production
58             library. We do not need to select pcre16_printint.c specially, because the
59             COMPILE_PCREx macro will already be appropriately set. */
60              
61             #ifdef PCRE_DEBUG
62             /* pcre_printint.c should not include any headers */
63             #define PCRE_INCLUDED
64             #include "pcre_printint.c"
65             #undef PCRE_INCLUDED
66             #endif
67              
68              
69             /* Macro for setting individual bits in class bitmaps. */
70              
/* Set bit number b in the byte-array bitmap a: byte b/8, bit b&7 counted from
the least significant bit. Both the array argument and the whole expansion are
parenthesized so that pointer expressions such as SETBIT(p + 2, n), and uses
inside a larger expression, expand as intended. Note that b is evaluated twice,
so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1U << ((b)&7)))
72              
73             /* Maximum length value to check against when making sure that the integer that
74             holds the compiled pattern length does not overflow. We make it a bit less than
75             INT_MAX to allow for adding in group terminating bytes, so that we don't have
76             to check them every time. */
77              
/* NOTE(review): INT_MAX comes from <limits.h>, which is presumably pulled in
through pcre_internal.h - confirm. The 20 subtracted here is the fixed headroom
left for group terminating bytes. */
78             #define OFLOW_MAX (INT_MAX - 20)
79              
80             /* Definitions to allow mutual recursion */
81              
/* Prototypes only. Both functions are static, so their bodies are defined
elsewhere in this translation unit; the parameters are unnamed here, so see the
definitions for what each one means. */
82             static int
83             add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84             const pcre_uint32 *, unsigned int);
85              
86             static BOOL
87             compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88             pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89             compile_data *, int *);
90              
91              
92              
93             /*************************************************
94             * Code parameters and static tables *
95             *************************************************/
96              
97             /* This value specifies the size of stack workspace that is used during the
98             first pre-compile phase that determines how much memory is required. The regex
99             is partly compiled into this space, but the compiled parts are discarded as
100             soon as they can be, so that hopefully there will never be an overrun. The code
101             does, however, check for an overrun. The largest amount I've seen used is 218,
102             so this number is very generous.
103              
104             The same workspace is used during the second, actual compile phase for
105             remembering forward references to groups so that they can be filled in at the
106             end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107             is 4 there is plenty of room for most patterns. However, the memory can get
108             filled up by repetitions of forward references, for example patterns like
109             /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110             that the workspace is expanded using malloc() in this situation. The value
111             below is therefore a minimum, and we put a maximum on it for safety. The
112             minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113             kicks in at the same number of forward references in all cases. */
114              
/* COMPILE_WORK_SIZE is the minimum workspace size (2048 times LINK_SIZE);
COMPILE_WORK_SIZE_MAX, 100 times that, is the upper limit placed on it. */
115             #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116             #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117              
118             /* This value determines the size of the initial vector that is used for
119             remembering named groups during the pre-compile. It is allocated on the stack,
120             but if it is too small, it is expanded using malloc(), in a similar way to the
121             workspace. The value is the number of slots in the list. */
122              
123             #define NAMED_GROUP_LIST_SIZE 20
124              
125             /* The overrun tests check for a slightly smaller size so that they detect the
126             overrun before it actually does run off the end of the data block. */
127              
/* NOTE(review): the unit of this margin (bytes or code units) is not visible
from here - confirm at the place where it is subtracted. */
128             #define WORK_SIZE_SAFETY_MARGIN (100)
129              
130             /* Private flags added to firstchar and reqchar. */
131              
/* REQ_CASELESS and REQ_VARY are unsigned single-bit masks (bits 0 and 1).
REQ_UNSET and REQ_NONE are two distinct negative int values, not bit masks.
NOTE(review): comparing the negative values with an unsigned operand would
convert them to large unsigned numbers - confirm the type of the variables
that hold these flags. */
132             #define REQ_CASELESS (1U << 0) /* Indicates caselessness */
133             #define REQ_VARY (1U << 1) /* Reqchar followed non-literal item */
134             /* Negative values for the firstchar and reqchar flags */
135             #define REQ_UNSET (-2)
136             #define REQ_NONE (-1)
137              
138             /* Repeated character flags. */
139              
/* Flag bit 0x10000000, of type long. The suffix is written as an upper-case L
because a lower-case l is easily misread as the digit 1; the value and the type
of the constant are unchanged. */

#define UTF_LENGTH 0x10000000L /* The char contains its length. */
141              
142             /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143             are simple data values; negative values are for special things like \d and so
144             on. Zero means further processing is needed (for things like \x), or the escape
145             is invalid. */
146              
147             #ifndef EBCDIC
148              
149             /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150             in UTF-8 mode. */
151              
/* Entry i describes the character '0' + i, so the table runs from '0' to 'z'.
Each row below holds the entries for two consecutive ASCII characters, which
are named in the trailing comment. */
152             static const short int escapes[] = {
153             0, 0, /* 0 1 */
154             0, 0, /* 2 3 */
155             0, 0, /* 4 5 */
156             0, 0, /* 6 7 */
157             0, 0, /* 8 9 */
158             CHAR_COLON, CHAR_SEMICOLON, /* : ; */
159             CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
160             CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
161             CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
162             -ESC_B, -ESC_C, /* B C */
163             -ESC_D, -ESC_E, /* D E */
164             0, -ESC_G, /* F G */
165             -ESC_H, 0, /* H I */
166             0, -ESC_K, /* J K */
167             0, 0, /* L M */
168             -ESC_N, 0, /* N O */
169             -ESC_P, -ESC_Q, /* P Q */
170             -ESC_R, -ESC_S, /* R S */
171             0, 0, /* T U */
172             -ESC_V, -ESC_W, /* V W */
173             -ESC_X, 0, /* X Y */
174             -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
175             CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */
176             CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
177             CHAR_GRAVE_ACCENT, ESC_a, /* ` a */
178             -ESC_b, 0, /* b c */
179             -ESC_d, ESC_e, /* d e */
180             ESC_f, 0, /* f g */
181             -ESC_h, 0, /* h i */
182             0, -ESC_k, /* j k */
183             0, 0, /* l m */
184             ESC_n, 0, /* n o */
185             -ESC_p, 0, /* p q */
186             ESC_r, -ESC_s, /* r s */
187             ESC_tee, 0, /* t u */
188             -ESC_v, -ESC_w, /* v w */
189             0, 0, /* x y */
190             -ESC_z /* z */
191             };
192              
193             #else
194              
195             /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196              
/* Each row holds eight entries and is labelled with the hexadecimal code of
its first entry, so the table starts at code 0x48 and its last entry is for
code 0xFF. */
197             static const short int escapes[] = {
198             /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199             /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200             /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201             /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202             /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203             /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204             /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205             /* 80 */ 0, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206             /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207             /* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p,
208             /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209             /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210             /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211             /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212             /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213             /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214             /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215             /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216             /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217             /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218             /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219             /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220             /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221             };
222              
223             /* We also need a table of characters that may follow \c in an EBCDIC
224             environment for characters 0-31. */
225              
/* The 32 characters that may follow the \c escape, one for each of the
characters 0-31, stored in that order (index 0 is '@', index 31 is '_'). This
is a fixed lookup table, so it is const-qualified like the other static tables
in this file. NOTE(review): no code that writes to it is visible from here -
confirm before relying on that. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
227              
228             #endif
229              
230              
231             /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
232             searched linearly. Put all the names into a single string, in order to reduce
233             the number of relocations when a shared library is dynamically linked. The
234             string is built from string macros so that it works in UTF-8 mode on EBCDIC
235             platforms. */
236              
/* One row of the verbs[] table: the length of a verb's name and the opcode to
use for the form without an argument and for the form with one. A value of -1
in an opcode field means that form is not available, as noted per field. */
237             typedef struct verbitem {
238             int len; /* Length of verb name */
239             int op; /* Op when no arg, or -1 if arg mandatory */
240             int op_arg; /* Op when arg present, or -1 if not allowed */
241             } verbitem;
242              
/* All verb names packed into one string, in the same order as the rows of
verbs[]. NOTE(review): the STRING_xxx0 macros are defined elsewhere; presumably
each expands to the name followed by a NUL, which would make the names
NUL-separated - confirm in pcre_internal.h. */
243             static const char verbnames[] =
244             "\0" /* Empty name is a shorthand for MARK */
245             STRING_MARK0
246             STRING_ACCEPT0
247             STRING_COMMIT0
248             STRING_F0
249             STRING_FAIL0
250             STRING_PRUNE0
251             STRING_SKIP0
252             STRING_THEN;
253              
/* One row per name in verbnames, in the same order. Each row is
{ name length, op without argument, op with argument }. */
254             static const verbitem verbs[] = {
255             { 0, -1, OP_MARK }, /* (empty name) */
256             { 4, -1, OP_MARK }, /* MARK */
257             { 6, OP_ACCEPT, -1 }, /* ACCEPT */
258             { 6, OP_COMMIT, -1 }, /* COMMIT */
259             { 1, OP_FAIL, -1 }, /* F */
260             { 4, OP_FAIL, -1 }, /* FAIL */
261             { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE */
262             { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP */
263             { 4, OP_THEN, OP_THEN_ARG } /* THEN */
264             };
265              
/* Number of rows in verbs[]. */
266             static const int verbcount = sizeof(verbs)/sizeof(verbitem);
267              
268              
269             /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
270             another regex library. */
271              
/* NUL-terminated pattern text: backslash b ( ? = backslash w ), i.e. a word
boundary followed by a lookahead for a word character. */
272             static const pcre_uchar sub_start_of_word[] = {
273             CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
274             CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
275              
/* NUL-terminated pattern text: backslash b ( ? < = backslash w ), i.e. a word
boundary preceded by a word character (lookbehind). */
276             static const pcre_uchar sub_end_of_word[] = {
277             CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
278             CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
279             CHAR_RIGHT_PARENTHESIS, '\0' };
280              
281              
282             /* Tables of names of POSIX character classes and their lengths. The names are
283             now all in a single string, to reduce the number of relocations when a shared
284             library is dynamically loaded. The list of lengths is terminated by a zero
285             length entry. The first three must be alpha, lower, upper, as this is assumed
286             for handling case independence. The indices for graph, print, and punct are
287             needed, so identify them. */
288              
/* Class names in table order: alpha, lower, upper, alnum, ascii, blank, cntrl,
digit, graph, print, punct, space, word, xdigit. */
289             static const char posix_names[] =
290             STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
291             STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
292             STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
293             STRING_word0 STRING_xdigit;
294              
/* Length of each name above, in the same order (word is 4, xdigit is 6, the
rest are 5), followed by the terminating 0. */
295             static const pcre_uint8 posix_name_lengths[] = {
296             5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
297              
/* Zero-based positions of graph, print and punct in the two tables above. */
298             #define PC_GRAPH 8
299             #define PC_PRINT 9
300             #define PC_PUNCT 10
301              
302              
303             /* Table of class bit maps for each POSIX class. Each class is formed from a
304             base map, with an optional addition or removal of another map. Then, for some
305             classes, there is some additional tweaking: for [:blank:] the vertical space
306             characters are removed, and for [:alpha:] and [:alnum:] the underscore
307             character is removed. The triples in the table consist of the base map offset,
308             second map offset or -1 if no second map, and a non-negative value for map
309             addition or a negative value for map subtraction (if there are two maps). The
310             absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
311             remove vertical space characters, 2 => remove underscore. */
312              
/* Three ints per class, one row per class, in the same order as posix_names
(14 rows). Example: the alpha row is the word map with the digit map
subtracted (third value negative) and the underscore removed (absolute value
2); the ascii row is the print map with the cntrl map added and no tweak. */
313             static const int posix_class_maps[] = {
314             cbit_word, cbit_digit, -2, /* alpha */
315             cbit_lower, -1, 0, /* lower */
316             cbit_upper, -1, 0, /* upper */
317             cbit_word, -1, 2, /* alnum - word without underscore */
318             cbit_print, cbit_cntrl, 0, /* ascii */
319             cbit_space, -1, 1, /* blank - a GNU extension */
320             cbit_cntrl, -1, 0, /* cntrl */
321             cbit_digit, -1, 0, /* digit */
322             cbit_graph, -1, 0, /* graph */
323             cbit_print, -1, 0, /* print */
324             cbit_punct, -1, 0, /* punct */
325             cbit_space, -1, 0, /* space */
326             cbit_word, -1, 0, /* word - a Perl extension */
327             cbit_xdigit,-1, 0 /* xdigit */
328             };
329              
330             /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
331             Unicode property escapes. */
332              
333             #ifdef SUPPORT_UCP
/* Each array below spells, as NUL-terminated pattern text, the property escape
shown in the comment in front of it. */
/* \P{Nd} */
334             static const pcre_uchar string_PNd[] = {
335             CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
336             CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Nd} */
337             static const pcre_uchar string_pNd[] = {
338             CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
339             CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xsp} */
340             static const pcre_uchar string_PXsp[] = {
341             CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
342             CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xsp} */
343             static const pcre_uchar string_pXsp[] = {
344             CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
345             CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xwd} */
346             static const pcre_uchar string_PXwd[] = {
347             CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
348             CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xwd} */
349             static const pcre_uchar string_pXwd[] = {
350             CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
351             CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
352              
/* Replacement pattern text for the six escapes named in the row comments, in
that order. NOTE(review): the strings pointed to are const but the array of
pointers itself is not; declaring it "const pcre_uchar * const" would make the
table read-only as well, provided no code assigns to its elements - confirm
against the users of this table before changing it. */
353             static const pcre_uchar *substitutes[] = {
354             string_PNd, /* \D */
355             string_pNd, /* \d */
356             string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl space and POSIX space are the same. */
357             string_pXsp, /* \s */
358             string_PXwd, /* \W */
359             string_pXwd /* \w */
360             };
361              
362             /* The POSIX class substitutes must be in the order of the POSIX class names,
363             defined above, and there are both positive and negative cases. NULL means no
364             general substitute of a Unicode property escape (\p or \P). However, for some
365             POSIX classes (e.g. graph, print, punct) a special property code is compiled
366             directly. */
367              
/* Each array below spells, as NUL-terminated pattern text, the escape shown in
the comment in front of it. */
/* \p{L} */
368             static const pcre_uchar string_pL[] = {
369             CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
370             CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Ll} */
371             static const pcre_uchar string_pLl[] = {
372             CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
373             CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Lu} */
374             static const pcre_uchar string_pLu[] = {
375             CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
376             CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xan} */
377             static const pcre_uchar string_pXan[] = {
378             CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
379             CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \h */
380             static const pcre_uchar string_h[] = {
381             CHAR_BACKSLASH, CHAR_h, '\0' };
/* \p{Xps} */
382             static const pcre_uchar string_pXps[] = {
383             CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
384             CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{L} */
385             static const pcre_uchar string_PL[] = {
386             CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
387             CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Ll} */
388             static const pcre_uchar string_PLl[] = {
389             CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
390             CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Lu} */
391             static const pcre_uchar string_PLu[] = {
392             CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
393             CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xan} */
394             static const pcre_uchar string_PXan[] = {
395             CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
396             CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \H */
397             static const pcre_uchar string_H[] = {
398             CHAR_BACKSLASH, CHAR_H, '\0' };
/* \P{Xps} */
399             static const pcre_uchar string_PXps[] = {
400             CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
401             CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
402              
/* 14 entries for the positive classes followed by 14 for the negated ones,
each group in POSIX class name order. NULL marks a class with no general
property-escape substitute. */
403             static const pcre_uchar *posix_substitutes[] = {
404             string_pL, /* alpha */
405             string_pLl, /* lower */
406             string_pLu, /* upper */
407             string_pXan, /* alnum */
408             NULL, /* ascii */
409             string_h, /* blank */
410             NULL, /* cntrl */
411             string_pNd, /* digit */
412             NULL, /* graph */
413             NULL, /* print */
414             NULL, /* punct */
415             string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 Perl and POSIX space are the same */
416             string_pXwd, /* word */
417             NULL, /* xdigit */
418             /* Negated cases */
419             string_PL, /* ^alpha */
420             string_PLl, /* ^lower */
421             string_PLu, /* ^upper */
422             string_PXan, /* ^alnum */
423             NULL, /* ^ascii */
424             string_H, /* ^blank */
425             NULL, /* ^cntrl */
426             string_PNd, /* ^digit */
427             NULL, /* ^graph */
428             NULL, /* ^print */
429             NULL, /* ^punct */
430             string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 Perl and POSIX space are the same */
431             string_PXwd, /* ^word */
432             NULL /* ^xdigit */
433             };
/* Number of entries in posix_substitutes[] (28: positive plus negated). */
434             #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
435             #endif
436              
/* STRING(a) turns its argument into a string literal exactly as written.
XSTRING(s) macro-expands s first and then stringifies the result, which is
what the error texts need for inserts such as XSTRING(MAX_NAME_SIZE). */
437             #define STRING(a) # a
438             #define XSTRING(s) STRING(s)
439              
440             /* The texts of compile-time error messages. These are "char *" because they
441             are passed to the outside world. Do not ever re-use any error number, because
442             they are documented. Always add a new error instead. Messages marked DEAD below
443             are no longer used. This used to be a table of strings, but in order to reduce
444             the number of relocations needed when a shared library is loaded dynamically,
445             it is now one long string. We cannot use a table of offsets, because the
446             lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
447             simply count through to the one we want - this isn't a performance issue
448             because these strings are used only when there is a compilation error.
449              
450             Each substring ends with \0 to insert a null character. This includes the final
451             substring, so that the whole string ends with \0\0, which can be detected when
452             counting through. */
453              
/* Message n is the n-th NUL-terminated substring, counting from 0. The numeric
marker comments inside the table give the number of the message that follows
them. The last message is number 87; the string literal's own terminating NUL
supplies the second NUL of the final pair. Exactly one of the two texts for
message 68 is compiled, depending on EBCDIC. */
454             static const char error_texts[] =
455             "no error\0"
456             "\\ at end of pattern\0"
457             "\\c at end of pattern\0"
458             "unrecognized character follows \\\0"
459             "numbers out of order in {} quantifier\0"
460             /* 5 */
461             "number too big in {} quantifier\0"
462             "missing terminating ] for character class\0"
463             "invalid escape sequence in character class\0"
464             "range out of order in character class\0"
465             "nothing to repeat\0"
466             /* 10 */
467             "internal error: invalid forward reference offset\0"
468             "internal error: unexpected repeat\0"
469             "unrecognized character after (? or (?-\0"
470             "POSIX named classes are supported only within a class\0"
471             "missing )\0"
472             /* 15 */
473             "reference to non-existent subpattern\0"
474             "erroffset passed as NULL\0"
475             "unknown option bit(s) set\0"
476             "missing ) after comment\0"
477             "parentheses nested too deeply\0" /** DEAD **/
478             /* 20 */
479             "regular expression is too large\0"
480             "failed to get memory\0"
481             "unmatched parentheses\0"
482             "internal error: code overflow\0"
483             "unrecognized character after (?<\0"
484             /* 25 */
485             "lookbehind assertion is not fixed length\0"
486             "malformed number or name after (?(\0"
487             "conditional group contains more than two branches\0"
488             "assertion expected after (?( or (?(?C)\0"
489             "(?R or (?[+-]digits must be followed by )\0"
490             /* 30 */
491             "unknown POSIX class name\0"
492             "POSIX collating elements are not supported\0"
493             "this version of PCRE is compiled without UTF support\0"
494             "spare error\0" /** DEAD **/
495             "character value in \\x{} or \\o{} is too large\0"
496             /* 35 */
497             "invalid condition (?(0)\0"
498             "\\C not allowed in lookbehind assertion\0"
499             "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
500             "number after (?C is > 255\0"
501             "closing ) for (?C expected\0"
502             /* 40 */
503             "recursive call could loop indefinitely\0"
504             "unrecognized character after (?P\0"
505             "syntax error in subpattern name (missing terminator)\0"
506             "two named subpatterns have the same name\0"
507             "invalid UTF-8 string\0"
508             /* 45 */
509             "support for \\P, \\p, and \\X has not been compiled\0"
510             "malformed \\P or \\p sequence\0"
511             "unknown property name after \\P or \\p\0"
512             "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
513             "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
514             /* 50 */
515             "repeated subpattern is too long\0" /** DEAD **/
516             "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
517             "internal error: overran compiling workspace\0"
518             "internal error: previously-checked referenced subpattern not found\0"
519             "DEFINE group contains more than one branch\0"
520             /* 55 */
521             "repeating a DEFINE group is not allowed\0" /** DEAD **/
522             "inconsistent NEWLINE options\0"
523             "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
524             "a numbered reference must not be zero\0"
525             "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
526             /* 60 */
527             "(*VERB) not recognized or malformed\0"
528             "number is too big\0"
529             "subpattern name expected\0"
530             "digit expected after (?+\0"
531             "] is an invalid data character in JavaScript compatibility mode\0"
532             /* 65 */
533             "different names for subpatterns of the same number are not allowed\0"
534             "(*MARK) must have an argument\0"
535             "this version of PCRE is not compiled with Unicode property support\0"
536             #ifndef EBCDIC
537             "\\c must be followed by an ASCII character\0"
538             #else
539             "\\c must be followed by a letter or one of [\\]^_?\0"
540             #endif
541             "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
542             /* 70 */
543             "internal error: unknown opcode in find_fixedlength()\0"
544             "\\N is not supported in a class\0"
545             "too many forward references\0"
546             "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
547             "invalid UTF-16 string\0"
548             /* 75 */
549             "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
550             "character value in \\u.... sequence is too large\0"
551             "invalid UTF-32 string\0"
552             "setting UTF is disabled by the application\0"
553             "non-hex character in \\x{} (closing brace missing?)\0"
554             /* 80 */
555             "non-octal character in \\o{} (closing brace missing?)\0"
556             "missing opening brace after \\o\0"
557             "parentheses are too deeply nested\0"
558             "invalid range in character class\0"
559             "group name must start with a non-digit\0"
560             /* 85 */
561             "parentheses are too deeply nested (stack check)\0"
562             "digits missing in \\x{} or \\o{}\0"
563             "regular expression is too complicated\0"
564             ;
565              
566             /* Table to identify digits and hex digits. This is used when compiling
567             patterns. Note that the tables in chartables are dependent on the locale, and
568             may mark arbitrary characters as digits - but the PCRE compiling code expects
569             to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
570             a private table here. It costs 256 bytes, but it is a lot faster than doing
571             character value tests (at least in some simple cases I timed), and in some
572             applications one wants PCRE to compile efficiently as well as match
573             efficiently.
574              
575             For convenience, we use the same bit definitions as in chartables:
576              
577             0x04 decimal digit
578             0x08 hexadecimal digit
579              
580             Then we can use ctype_digit and ctype_xdigit in the code. */
581              
582             /* Using a simple comparison for decimal numbers rather than a memory read
583             is much faster, and the resulting code is simpler (the compiler turns it
584             into a subtraction and unsigned comparison). */
585              
/* True when x lies between CHAR_0 and CHAR_9 inclusive. The argument is
evaluated twice, so it must not have side effects. */
586             #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
587              
588             #ifndef EBCDIC
589              
590             /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
591             UTF-8 mode. */
592              
/* Indexed by character code 0-255, eight entries per row. 0x0c (0x04 | 0x08)
marks 0-9 as both decimal and hexadecimal digits; 0x08 marks A-F and a-f as
hexadecimal digits only; every other entry is 0. */
593             static const pcre_uint8 digitab[] =
594             {
595             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
596             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
597             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
598             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
599             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
600             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
601             0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
602             0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
603             0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
604             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
605             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
606             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
607             0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
608             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
609             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
610             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
611             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
612             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
613             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
614             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
615             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
616             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
617             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
618             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
619             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
620             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
621             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
622             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
623             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
624             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
625             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
626             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
627              
628             #else
629              
630             /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
                In this table the hexadecimal-only flag 0x08 is set at positions 129-134
                (the row ending at g) and 193-198 (the row ending at G), and the digit
                flags 0x0c are set at positions 240-249; every other entry is zero. */
631              
632             static const pcre_uint8 digitab[] =
633             {
634             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
635             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
636             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
637             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
638             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
639             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
640             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
641             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
642             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
643             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
644             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
645             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
646             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
647             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
648             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
649             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
650             0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
651             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
652             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
653             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
654             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
655             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
656             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
657             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
658             0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
659             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
660             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
661             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
662             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
663             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
664             0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
665             0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
666              
667             static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
                /* In EBCDIC builds check_escape() masks an entry with 0x0E and treats a
                zero result as "not alphanumeric". NOTE(review): the meaning of the
                individual bits is presumably that of the chartables ctype flags - confirm
                against the chartables definitions before relying on it. */
668             0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
669             0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
670             0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
671             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
672             0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
673             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
674             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
675             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
676             0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
677             0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
678             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
679             0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
680             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
681             0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
682             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
683             0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
684             0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
685             0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
686             0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
687             0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
688             0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
689             0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
690             0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
691             0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
692             0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
693             0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
694             0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
695             0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
696             0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
697             0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
698             0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
699             0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
700             #endif
701              
702              
703             /* This table is used to check whether auto-possessification is possible
704             between adjacent character-type opcodes. The left-hand (repeated) opcode is
705             used to select the row, and the right-hand opcode is used to select the column.
706             A value of 1 means that auto-possessification is OK. For example, the second
707             value in the first row means that \D+\d can be turned into \D++\d.
708              
709             The Unicode property types (\P and \p) have to be present to fill out the table
710             because of what their opcode values are, but the table values should always be
711             zero because property types are handled separately in the code. The last four
712             columns apply to items that cannot be repeated, so there is no need to have
713             rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
714             *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
715              
716             #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
717             #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
718              
                /* The initializer below supplies 17 rows (labelled \D through \X on the
                right) of 21 values each (labelled \D through $M in the heading line), so
                APTROWS and APTCOLS are expected to evaluate to those counts. */
719             static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
720             /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
721             { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
722             { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
723             { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
724             { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
725             { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
726             { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
727             { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
728             { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
729             { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
730             { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
731             { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
732             { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
733             { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
734             { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
735             { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
736             { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
737             { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
738             };
739              
740              
741             /* This table is used to check whether auto-possessification is possible
742             between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
743             left-hand (repeated) opcode is used to select the row, and the right-hand
744             opcode is used to select the column. The values are as follows:
745              
746             0 Always return FALSE (never auto-possessify)
747             1 Character groups are distinct (possessify if both are OP_PROP)
748             2 Check character categories in the same group (general or particular)
749             3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
750              
751             4 Check left general category vs right particular category
752             5 Check right general category vs left particular category
753              
754             6 Left alphanum vs right general category
755             7 Left space vs right general category
756             8 Left word vs right general category
757              
758             9 Right alphanum vs left general category
759             10 Right space vs left general category
760             11 Right word vs left general category
761              
762             12 Left alphanum vs right particular category
763             13 Left space vs right particular category
764             14 Left word vs right particular category
765              
766             15 Right alphanum vs left particular category
767             16 Right space vs left particular category
768             17 Right word vs left particular category
769             */
770              
771             static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
                /* Row = property type of the left-hand (repeated) item, column = property
                type of the right-hand item. Each cell holds one of the action codes 0-17
                listed in the comment that precedes this table. The initializer supplies
                11 rows of 11 values. */
772             /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
773             { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
774             { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
775             { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
776             { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
777             { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
778             { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
779             { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
780             { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
781             { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
782             { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
783             { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
784             };
785              
786             /* This table is used to check whether auto-possessification is possible
787             between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
788             specifies a general category and the other specifies a particular category. The
789             row is selected by the general category and the column by the particular
790             category. The value is 1 if the particular category is not part of the general
791             category. */
792              
793             static const pcre_uint8 catposstab[7][30] = {
                /* 7 rows, one per general category (C, L, M, N, P, S, Z); 30 columns, one
                per particular category in the order given by the heading line below. A 0
                marks a particular category that belongs to the row's general category. */
794             /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
795             { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
796             { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
797             { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
798             { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
799             { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
800             { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
801             { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
802             };
803              
804             /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
805             a general or particular category. The properties in each row are those
806             that apply to the character set in question. Duplication means that a little
807             unnecessary work is done when checking, but this keeps things much simpler
808             because they can all use the same code. For more details see the comment where
809             this table is used.
810              
811             Note: SPACE and PXSPACE used to be different because Perl excluded VT from
812             "space", but from Perl 5.18 it's included, so both categories are treated the
813             same here. */
814              
815             static const pcre_uint8 posspropstab[3][4] = {
                /* One row per character set (named in the row comments). Every row holds
                four ucp_ property codes; rows that need fewer repeat a value so that all
                rows have the same length. */
816             { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
817             { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
818             { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
819             };
820              
821             /* This table is used when converting repeating opcodes into possessified
822             versions as a result of an explicit possessive quantifier such as ++. A zero
823             value means there is no possessified version - in those cases the item in
824             question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
825             because all relevant opcodes are less than that. */
826              
827             static const pcre_uint8 opcode_possessify[] = {
                /* Only the plain greedy repeat opcodes have a non-zero entry, which names
                the possessive opcode that replaces them; the minimizing, exact and
                already-possessive forms are all 0. NOTE(review): the table appears to be
                indexed directly by opcode value (see the range comments on the first two
                rows) - confirm at the point of use. */
828             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
829             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
830              
831             0, /* NOTI */
832             OP_POSSTAR, 0, /* STAR, MINSTAR */
833             OP_POSPLUS, 0, /* PLUS, MINPLUS */
834             OP_POSQUERY, 0, /* QUERY, MINQUERY */
835             OP_POSUPTO, 0, /* UPTO, MINUPTO */
836             0, /* EXACT */
837             0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
838              
839             OP_POSSTARI, 0, /* STARI, MINSTARI */
840             OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
841             OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
842             OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
843             0, /* EXACTI */
844             0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
845              
846             OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
847             OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
848             OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
849             OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
850             0, /* NOTEXACT */
851             0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
852              
853             OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
854             OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
855             OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
856             OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
857             0, /* NOTEXACTI */
858             0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
859              
860             OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
861             OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
862             OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
863             OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
864             0, /* TYPEEXACT */
865             0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
866              
867             OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
868             OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
869             OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
870             OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
871             0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
872              
873             0, 0, 0, /* CLASS, NCLASS, XCLASS */
874             0, 0, /* REF, REFI */
875             0, 0, /* DNREF, DNREFI */
876             0, 0 /* RECURSE, CALLOUT */
877             };
878              
879              
880              
881             /*************************************************
882             * Find an error text *
883             *************************************************/
884              
885             /* The error texts are now all in one long string, to save on relocations. As
886             some of the text is of unknown length, we can't use a table of offsets.
887             Instead, just count through the strings. This is not a performance issue
888             because it happens only when there has been a compilation error.
                Each message in error_texts ends with a zero character. Meeting a second
                zero immediately after one of those terminators means the list ended before
                message n was reached, and a fixed fallback message is returned instead.
                When n is zero or negative the loop does not run and the first message is
                returned.
889              
890             Argument: the error number
891             Returns: pointer to the error string
892             */
893              
894             static const char *
895 0           find_error_text(int n)
896             {
897 0           const char *s = error_texts;
898 0 0         for (; n > 0; n--)
899             {
                /* Skip one whole message; s ends up just past its terminating zero. */
900 0 0         while (*s++ != CHAR_NULL) {};
                /* An empty string here marks the end of the list of messages. */
901 0 0         if (*s == CHAR_NULL) return "Error text not found (please report)";
902             }
903 0           return s;
904             }
905              
906              
907              
908             /*************************************************
909             * Expand the workspace *
910             *************************************************/
911              
912             /* This function is called during the second compiling phase, if the number of
913             forward references fills the existing workspace, which is originally a block on
914             the stack. A larger block is obtained from malloc() unless the ultimate limit
915             has been reached or the increase will be rather small.
                The new size is double the current size, capped at COMPILE_WORK_SIZE_MAX.
                The existing contents are copied into the new block and cd->hwm is moved to
                the same offset within it. The old block is freed only when the current
                size is greater than COMPILE_WORK_SIZE, which presumably means that it came
                from an earlier expansion rather than from the original stack block.
916              
917             Argument: pointer to the compile data block
918             Returns: 0 if all went well, else an error number
                (ERR72 if the workspace cannot usefully grow, ERR21 if
                malloc() fails)
919             */
920              
921             static int
922 0           expand_workspace(compile_data *cd)
923             {
924             pcre_uchar *newspace;
925 0           int newsize = cd->workspace_size * 2;
926              
927 0 0         if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
                /* Give up if the workspace is already at the limit, or if the capped growth
                is smaller than the safety margin. */
928 0 0         if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
    0          
929 0           newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
930 0           return ERR72;
931              
932 0           newspace = (PUBL(malloc))(IN_UCHARS(newsize));
933 0 0         if (newspace == NULL) return ERR21;
934 0           memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
                /* Keep the high-water mark at the same offset in the new block. */
935 0           cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
936 0 0         if (cd->workspace_size > COMPILE_WORK_SIZE)
937 0           (PUBL(free))((void *)cd->start_workspace);
938 0           cd->start_workspace = newspace;
939 0           cd->workspace_size = newsize;
940 0           return 0;
941             }
942              
943              
944              
945             /*************************************************
946             * Check for counted repeat *
947             *************************************************/
948              
949             /* This function is called when a '{' is encountered in a place where it might
950             start a quantifier. It looks ahead to see if it really is a quantifier or not.
951             It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
952             where the ddds are digits.
                Only the syntax is checked here; the numeric values are not examined. Any
                character other than a digit, a comma in the one permitted position, or the
                closing '}' ends the scan with FALSE. The pointer is passed by value, so the
                caller's position in the pattern is not changed.
953              
954             Arguments:
955             p pointer to the first char after '{'
956              
957             Returns: TRUE or FALSE
958             */
959              
960             static BOOL
961 0           is_counted_repeat(const pcre_uchar *p)
962             {
                /* The first number is mandatory: at least one digit. */
963 0 0         if (!IS_DIGIT(*p)) return FALSE;
    0          
964 0           p++;
965 0 0         while (IS_DIGIT(*p)) p++;
    0          
                /* Form {ddd} */
966 0 0         if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
967              
                /* Anything else must continue with a comma; {ddd,} is complete here. */
968 0 0         if (*p++ != CHAR_COMMA) return FALSE;
969 0 0         if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
970              
                /* Form {ddd,ddd}: the second number also needs at least one digit. */
971 0 0         if (!IS_DIGIT(*p)) return FALSE;
    0          
972 0           p++;
973 0 0         while (IS_DIGIT(*p)) p++;
    0          
974              
975 0           return (*p == CHAR_RIGHT_CURLY_BRACKET);
976             }
977              
978              
979              
980             /*************************************************
981             * Handle escapes *
982             *************************************************/
983              
984             /* This function is called when a \ has been encountered. It either returns a
985             positive value for a simple escape such as \n, or 0 for a data character which
986             will be placed in chptr. A backreference to group n is returned as negative n.
987             When UTF-8 is enabled, a positive value greater than 255 may be returned in
988             chptr. On entry, ptr is pointing at the \. On exit, it is on the final
989             character of the escape sequence.
990              
991             Arguments:
992             ptrptr points to the pattern position pointer
993             chptr points to a returned data character
994             errorcodeptr points to the errorcode variable
995             bracount number of previous extracting brackets
996             options the options bits
997             isclass TRUE if inside a character class
998              
999             Returns: zero => a data character
1000             positive => a special escape sequence
1001             negative => a back reference
1002             on error, errorcodeptr is set
1003             */
1004              
1005             static int
1006 164           check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
1007             int bracount, int options, BOOL isclass)
1008             {
1009             /* PCRE_UTF16 has the same value as PCRE_UTF8. */
1010 164           BOOL utf = (options & PCRE_UTF8) != 0;
1011 164           const pcre_uchar *ptr = *ptrptr + 1;
1012             pcre_uint32 c;
1013 164           int escape = 0;
1014             int i;
1015              
1016 164           GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1017 164           ptr--; /* Set pointer back to the last byte */
1018              
1019             /* If backslash is at the end of the pattern, it's an error. */
1020              
1021 164 50         if (c == CHAR_NULL) *errorcodeptr = ERR1;
1022              
1023             /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1024             in a table. A non-zero result is something that can be returned immediately.
1025             Otherwise further processing may be required. */
1026              
1027             #ifndef EBCDIC /* ASCII/UTF-8 coding */
1028             /* Not alphanumeric */
1029 164 50         else if (c < CHAR_0 || c > CHAR_z) {}
    0          
1030 0 0         else if ((i = escapes[c - CHAR_0]) != 0)
1031 0 0         { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1032              
1033             #else /* EBCDIC coding */
1034             /* Not alphanumeric */
1035             else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1036             else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1037             #endif
1038              
1039             /* Escapes that need further processing, or are illegal. */
1040              
1041             else
1042             {
1043             const pcre_uchar *oldptr;
1044             BOOL braced, negated, overflow;
1045             int s;
1046              
1047 0           switch (c)
1048             {
1049             /* A number of Perl escapes are not handled by PCRE. We give an explicit
1050             error. */
1051              
1052             case CHAR_l:
1053             case CHAR_L:
1054 0           *errorcodeptr = ERR37;
1055 0           break;
1056              
1057             case CHAR_u:
1058 0 0         if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1059             {
1060             /* In JavaScript, \u must be followed by four hexadecimal numbers.
1061             Otherwise it is a lowercase u letter. */
1062 0 0         if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1063 0           && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1064 0 0         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1065 0 0         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
    0          
1066             {
1067 0           c = 0;
1068 0 0         for (i = 0; i < 4; ++i)
1069             {
1070 0           register pcre_uint32 cc = *(++ptr);
1071             #ifndef EBCDIC /* ASCII/UTF-8 coding */
1072 0 0         if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1073 0 0         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1074             #else /* EBCDIC coding */
1075             if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1076             c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1077             #endif
1078             }
1079              
1080             #if defined COMPILE_PCRE8
1081 0 0         if (c > (utf ? 0x10ffffU : 0xffU))
    0          
1082             #elif defined COMPILE_PCRE16
1083             if (c > (utf ? 0x10ffffU : 0xffffU))
1084             #elif defined COMPILE_PCRE32
1085             if (utf && c > 0x10ffffU)
1086             #endif
1087             {
1088 0           *errorcodeptr = ERR76;
1089             }
1090 0 0         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
    0          
    0          
1091             }
1092             }
1093             else
1094 0           *errorcodeptr = ERR37;
1095 0           break;
1096              
1097             case CHAR_U:
1098             /* In JavaScript, \U is an uppercase U letter. */
1099 0 0         if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1100 0           break;
1101              
1102             /* In a character class, \g is just a literal "g". Outside a character
1103             class, \g must be followed by one of a number of specific things:
1104              
1105             (1) A number, either plain or braced. If positive, it is an absolute
1106             backreference. If negative, it is a relative backreference. This is a Perl
1107             5.10 feature.
1108              
1109             (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1110             is part of Perl's movement towards a unified syntax for back references. As
1111             this is synonymous with \k{name}, we fudge it up by pretending it really
1112             was \k.
1113              
1114             (3) For Oniguruma compatibility we also support \g followed by a name or a
1115             number either in angle brackets or in single quotes. However, these are
1116             (possibly recursive) subroutine calls, _not_ backreferences. Just return
1117             the ESC_g code (cf \k). */
1118              
1119             case CHAR_g:
1120 0 0         if (isclass) break;
1121 0 0         if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
    0          
1122             {
1123 0           escape = ESC_g;
1124 0           break;
1125             }
1126              
1127             /* Handle the Perl-compatible cases */
1128              
1129 0 0         if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1130             {
1131             const pcre_uchar *p;
1132 0 0         for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
    0          
1133 0 0         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
    0          
    0          
1134 0 0         if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
    0          
1135             {
1136 0           escape = ESC_k;
1137 0           break;
1138             }
1139 0           braced = TRUE;
1140 0           ptr++;
1141             }
1142 0           else braced = FALSE;
1143              
1144 0 0         if (ptr[1] == CHAR_MINUS)
1145             {
1146 0           negated = TRUE;
1147 0           ptr++;
1148             }
1149 0           else negated = FALSE;
1150              
1151             /* The integer range is limited by the machine's int representation. */
1152 0           s = 0;
1153 0           overflow = FALSE;
1154 0 0         while (IS_DIGIT(ptr[1]))
    0          
1155             {
1156 0 0         if (s > INT_MAX / 10 - 1) /* Integer overflow */
1157             {
1158 0           overflow = TRUE;
1159 0           break;
1160             }
1161 0           s = s * 10 + (int)(*(++ptr) - CHAR_0);
1162             }
1163 0 0         if (overflow) /* Integer overflow */
1164             {
1165 0 0         while (IS_DIGIT(ptr[1]))
    0          
1166 0           ptr++;
1167 0           *errorcodeptr = ERR61;
1168 0           break;
1169             }
1170              
1171 0 0         if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
    0          
1172             {
1173 0           *errorcodeptr = ERR57;
1174 0           break;
1175             }
1176              
1177 0 0         if (s == 0)
1178             {
1179 0           *errorcodeptr = ERR58;
1180 0           break;
1181             }
1182              
1183 0 0         if (negated)
1184             {
1185 0 0         if (s > bracount)
1186             {
1187 0           *errorcodeptr = ERR15;
1188 0           break;
1189             }
1190 0           s = bracount - (s - 1);
1191             }
1192              
1193 0           escape = -s;
1194 0           break;
1195              
1196             /* The handling of escape sequences consisting of a string of digits
1197             starting with one that is not zero is not straightforward. Perl has changed
1198             over the years. Nowadays \g{} for backreferences and \o{} for octal are
1199             recommended to avoid the ambiguities in the old syntax.
1200              
1201             Outside a character class, the digits are read as a decimal number. If the
1202             number is less than 8 (used to be 10), or if there are that many previous
1203             extracting left brackets, then it is a back reference. Otherwise, up to
1204             three octal digits are read to form an escaped byte. Thus \123 is likely to
1205             be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1206             the octal value is greater than 377, the least significant 8 bits are
1207             taken. \8 and \9 are treated as the literal characters 8 and 9.
1208              
1209             Inside a character class, \ followed by a digit is always either a literal
1210             8 or 9 or an octal number. */
1211              
1212             case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1213             case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1214              
1215 0 0         if (!isclass)
1216             {
1217 0           oldptr = ptr;
1218             /* The integer range is limited by the machine's int representation. */
1219 0           s = (int)(c -CHAR_0);
1220 0           overflow = FALSE;
1221 0 0         while (IS_DIGIT(ptr[1]))
    0          
1222             {
1223 0 0         if (s > INT_MAX / 10 - 1) /* Integer overflow */
1224             {
1225 0           overflow = TRUE;
1226 0           break;
1227             }
1228 0           s = s * 10 + (int)(*(++ptr) - CHAR_0);
1229             }
1230 0 0         if (overflow) /* Integer overflow */
1231             {
1232 0 0         while (IS_DIGIT(ptr[1]))
    0          
1233 0           ptr++;
1234 0           *errorcodeptr = ERR61;
1235 0           break;
1236             }
1237 0 0         if (s < 8 || s <= bracount) /* Check for back reference */
    0          
1238             {
1239 0           escape = -s;
1240 0           break;
1241             }
1242 0           ptr = oldptr; /* Put the pointer back and fall through */
1243             }
1244              
1245             /* Handle a digit following \ when the number is not a back reference. If
1246             the first digit is 8 or 9, Perl used to generate a binary zero byte and
1247             then treat the digit as a following literal. At least by Perl 5.18 this
1248             changed so as not to insert the binary zero. */
1249              
1250 0 0         if ((c = *ptr) >= CHAR_8) break;
1251              
1252             /* Fall through with a digit less than 8 */
1253              
1254             /* \0 always starts an octal number, but we may drop through to here with a
1255             larger first octal digit. The original code used just to take the least
1256             significant 8 bits of octal numbers (I think this is what early Perls used
1257             to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1258             but no more than 3 octal digits. */
1259              
1260             case CHAR_0:
1261 0           c -= CHAR_0;
1262 0 0         while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
    0          
    0          
1263 0           c = c * 8 + *(++ptr) - CHAR_0;
1264             #ifdef COMPILE_PCRE8
1265 0 0         if (!utf && c > 0xff) *errorcodeptr = ERR51;
    0          
1266             #endif
1267 0           break;
1268              
1269             /* \o is a relatively new Perl feature, supporting a more general way of
1270             specifying character codes in octal. The only supported form is \o{ddd}. */
1271              
1272             case CHAR_o:
1273 0 0         if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1274 0 0         if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1275             {
1276 0           ptr += 2;
1277 0           c = 0;
1278 0           overflow = FALSE;
1279 0 0         while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
    0          
1280             {
1281 0           register pcre_uint32 cc = *ptr++;
1282 0 0         if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
    0          
1283             #ifdef COMPILE_PCRE32
1284             if (c >= 0x20000000l) { overflow = TRUE; break; }
1285             #endif
1286 0           c = (c << 3) + cc - CHAR_0 ;
1287             #if defined COMPILE_PCRE8
1288 0 0         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
    0          
1289             #elif defined COMPILE_PCRE16
1290             if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1291             #elif defined COMPILE_PCRE32
1292             if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1293             #endif
1294             }
1295 0 0         if (overflow)
1296             {
1297 0 0         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
    0          
1298 0           *errorcodeptr = ERR34;
1299             }
1300 0 0         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1301             {
1302 0 0         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
    0          
    0          
1303             }
1304 0           else *errorcodeptr = ERR80;
1305             }
1306 0           break;
1307              
1308             /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1309             numbers. Otherwise it is a lowercase x letter. */
1310              
1311             case CHAR_x:
1312 0 0         if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1313             {
1314 0 0         if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1315 0 0         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1316             {
1317 0           c = 0;
1318 0 0         for (i = 0; i < 2; ++i)
1319             {
1320 0           register pcre_uint32 cc = *(++ptr);
1321             #ifndef EBCDIC /* ASCII/UTF-8 coding */
1322 0 0         if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1323 0 0         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1324             #else /* EBCDIC coding */
1325             if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1326             c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1327             #endif
1328             }
1329             }
1330             } /* End JavaScript handling */
1331              
1332             /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1333             greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1334             digits. If not, { used to be treated as a data character. However, Perl
1335             seems to read hex digits up to the first non-such, and ignore the rest, so
1336             that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1337             now gives an error. */
1338              
1339             else
1340             {
1341 0 0         if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1342             {
1343 0           ptr += 2;
1344 0 0         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1345             {
1346 0           *errorcodeptr = ERR86;
1347 0           break;
1348             }
1349 0           c = 0;
1350 0           overflow = FALSE;
1351 0 0         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1352             {
1353 0           register pcre_uint32 cc = *ptr++;
1354 0 0         if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
    0          
1355              
1356             #ifdef COMPILE_PCRE32
1357             if (c >= 0x10000000l) { overflow = TRUE; break; }
1358             #endif
1359              
1360             #ifndef EBCDIC /* ASCII/UTF-8 coding */
1361 0 0         if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1362 0 0         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1363             #else /* EBCDIC coding */
1364             if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1365             c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1366             #endif
1367              
1368             #if defined COMPILE_PCRE8
1369 0 0         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
    0          
1370             #elif defined COMPILE_PCRE16
1371             if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1372             #elif defined COMPILE_PCRE32
1373             if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1374             #endif
1375             }
1376              
1377 0 0         if (overflow)
1378             {
1379 0 0         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1380 0           *errorcodeptr = ERR34;
1381             }
1382              
1383 0 0         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1384             {
1385 0 0         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
    0          
    0          
1386             }
1387              
1388             /* If the sequence of hex digits does not end with '}', give an error.
1389             We used just to recognize this construct and fall through to the normal
1390             \x handling, but nowadays Perl gives an error, which seems much more
1391             sensible, so we do too. */
1392              
1393 0           else *errorcodeptr = ERR79;
1394             } /* End of \x{} processing */
1395              
1396             /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1397              
1398             else
1399             {
1400 0           c = 0;
1401 0 0         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
    0          
1402             {
1403             pcre_uint32 cc; /* Some compilers don't like */
1404 0           cc = *(++ptr); /* ++ in initializers */
1405             #ifndef EBCDIC /* ASCII/UTF-8 coding */
1406 0 0         if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1407 0 0         c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1408             #else /* EBCDIC coding */
1409             if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1410             c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1411             #endif
1412             }
1413             } /* End of \xdd handling */
1414             } /* End of Perl-style \x handling */
1415 0           break;
1416              
1417             /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1418             An error is given if the byte following \c is not an ASCII character. This
1419             coding is ASCII-specific, but then the whole concept of \cx is
1420             ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1421              
1422             case CHAR_c:
1423 0           c = *(++ptr);
1424 0 0         if (c == CHAR_NULL)
1425             {
1426 0           *errorcodeptr = ERR2;
1427 0           break;
1428             }
1429             #ifndef EBCDIC /* ASCII/UTF-8 coding */
1430 0 0         if (c > 127) /* Excludes all non-ASCII in either mode */
1431             {
1432 0           *errorcodeptr = ERR68;
1433 0           break;
1434             }
1435 0 0         if (c >= CHAR_a && c <= CHAR_z) c -= 32;
    0          
1436 0           c ^= 0x40;
1437             #else /* EBCDIC coding */
1438             if (c >= CHAR_a && c <= CHAR_z) c += 64;
1439             if (c == CHAR_QUESTION_MARK)
1440             c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1441             else
1442             {
1443             for (i = 0; i < 32; i++)
1444             {
1445             if (c == ebcdic_escape_c[i]) break;
1446             }
1447             if (i < 32) c = i; else *errorcodeptr = ERR68;
1448             }
1449             #endif
1450 0           break;
1451              
1452             /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1453             other alphanumeric following \ is an error if PCRE_EXTRA was set;
1454             otherwise, for Perl compatibility, it is a literal. This code looks a bit
1455             odd, but there used to be some cases other than the default, and there may
1456             be again in future, so I haven't "optimized" it. */
1457              
1458             default:
1459 0 0         if ((options & PCRE_EXTRA) != 0) switch(c)
1460             {
1461             default:
1462 0           *errorcodeptr = ERR3;
1463 0           break;
1464             }
1465 0           break;
1466             }
1467             }
1468              
1469             /* Perl supports \N{name} for character names, as well as plain \N for "not
1470             newline". PCRE does not support \N{name}. However, it does support
1471             quantification such as \N{2,3}. */
1472              
1473 164 50         if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1474 0           !is_counted_repeat(ptr+2))
1475 0           *errorcodeptr = ERR37;
1476              
1477             /* If PCRE_UCP is set, we change the values for \d etc. */
1478              
1479 164 50         if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
    0          
    0          
1480 0           escape += (ESC_DU - ESC_D);
1481              
1482             /* Set the pointer to the final character before returning. */
1483              
1484 164           *ptrptr = ptr;
1485 164           *chptr = c;
1486 164           return escape;
1487             }
1488              
1489              
1490              
1491             #ifdef SUPPORT_UCP
1492             /*************************************************
1493             * Handle \P and \p *
1494             *************************************************/
1495              
1496             /* This function is called after \P or \p has been encountered, provided that
1497             PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1498             pointing at the P or p. On exit, it is pointing at the final character of the
1499             escape sequence.
1500              
1501             Argument:
1502             ptrptr points to the pattern position pointer
1503             negptr points to a boolean that is set TRUE for negation else FALSE
1504             ptypeptr points to an unsigned int that is set to the type value
1505             pdataptr points to an unsigned int that is set to the detailed property value
1506             errorcodeptr points to the error code variable
1507              
1508             Returns: TRUE if the type value was found, or FALSE for an invalid type
1509             */
1510              
1511             static BOOL
1512             get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1513             unsigned int *pdataptr, int *errorcodeptr)
1514             {
1515             pcre_uchar c;
1516             int i, bot, top;
1517             const pcre_uchar *ptr = *ptrptr;
1518             pcre_uchar name[32];
1519              
1520             c = *(++ptr);
1521             if (c == CHAR_NULL) goto ERROR_RETURN;
1522              
1523             *negptr = FALSE;
1524              
1525             /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1526             negation. */
1527              
1528             if (c == CHAR_LEFT_CURLY_BRACKET)
1529             {
1530             if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1531             {
1532             *negptr = TRUE;
1533             ptr++;
1534             }
1535             for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1536             {
1537             c = *(++ptr);
1538             if (c == CHAR_NULL) goto ERROR_RETURN;
1539             if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1540             name[i] = c;
1541             }
1542             if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1543             name[i] = 0;
1544             }
1545              
1546             /* Otherwise there is just one following character */
1547              
1548             else
1549             {
1550             name[0] = c;
1551             name[1] = 0;
1552             }
1553              
1554             *ptrptr = ptr;
1555              
1556             /* Search for a recognized property name using binary chop */
1557              
1558             bot = 0;
1559             top = PRIV(utt_size);
1560              
1561             while (bot < top)
1562             {
1563             int r;
1564             i = (bot + top) >> 1;
1565             r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1566             if (r == 0)
1567             {
1568             *ptypeptr = PRIV(utt)[i].type;
1569             *pdataptr = PRIV(utt)[i].value;
1570             return TRUE;
1571             }
1572             if (r > 0) bot = i + 1; else top = i;
1573             }
1574              
1575             *errorcodeptr = ERR47;
1576             *ptrptr = ptr;
1577             return FALSE;
1578              
1579             ERROR_RETURN:
1580             *errorcodeptr = ERR46;
1581             *ptrptr = ptr;
1582             return FALSE;
1583             }
1584             #endif
1585              
1586              
1587              
1588             /*************************************************
1589             * Read repeat counts *
1590             *************************************************/
1591              
1592             /* Read an item of the form {n,m} and return the values. This is called only
1593             after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1594             so the syntax is guaranteed to be correct, but we need to check the values.
1595              
1596             Arguments:
1597             p pointer to first char after '{'
1598             minp pointer to int for min
1599             maxp pointer to int for max
1600             returned as -1 if no max
1601             errorcodeptr points to error code variable
1602              
1603             Returns: pointer to '}' on success;
1604             current ptr on error, with errorcodeptr set non-zero
1605             */
1606              
1607             static const pcre_uchar *
1608 0           read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1609             {
1610 0           int min = 0;
1611 0           int max = -1;
1612              
1613 0 0         while (IS_DIGIT(*p))
    0          
1614             {
1615 0           min = min * 10 + (int)(*p++ - CHAR_0);
1616 0 0         if (min > 65535)
1617             {
1618 0           *errorcodeptr = ERR5;
1619 0           return p;
1620             }
1621             }
1622              
1623 0 0         if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1624             {
1625 0 0         if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1626             {
1627 0           max = 0;
1628 0 0         while(IS_DIGIT(*p))
    0          
1629             {
1630 0           max = max * 10 + (int)(*p++ - CHAR_0);
1631 0 0         if (max > 65535)
1632             {
1633 0           *errorcodeptr = ERR5;
1634 0           return p;
1635             }
1636             }
1637 0 0         if (max < min)
1638             {
1639 0           *errorcodeptr = ERR4;
1640 0           return p;
1641             }
1642             }
1643             }
1644              
1645 0           *minp = min;
1646 0           *maxp = max;
1647 0           return p;
1648             }
1649              
1650              
1651              
1652             /*************************************************
1653             * Find first significant op code *
1654             *************************************************/
1655              
1656             /* This is called by several functions that scan a compiled expression looking
1657             for a fixed first character, or an anchoring op code etc. It skips over things
1658             that do not influence this. For some calls, it makes sense to skip negative
1659             forward and all backward assertions, and also the \b assertion; for others it
1660             does not.
1661              
1662             Arguments:
1663             code pointer to the start of the group
1664             skipassert TRUE if certain assertions are to be skipped
1665              
1666             Returns: pointer to the first significant opcode
1667             */
1668              
1669             static const pcre_uchar*
1670 111           first_significant_code(const pcre_uchar *code, BOOL skipassert)
1671             {
1672             for (;;)
1673             {
1674 111           switch ((int)*code)
1675             {
1676             case OP_ASSERT_NOT:
1677             case OP_ASSERTBACK:
1678             case OP_ASSERTBACK_NOT:
1679 0 0         if (!skipassert) return code;
1680 0 0         do code += GET(code, 1); while (*code == OP_ALT);
1681 0           code += PRIV(OP_lengths)[*code];
1682 0           break;
1683              
1684             case OP_WORD_BOUNDARY:
1685             case OP_NOT_WORD_BOUNDARY:
1686 0 0         if (!skipassert) return code;
1687             /* Fall through */
1688              
1689             case OP_CALLOUT:
1690             case OP_CREF:
1691             case OP_DNCREF:
1692             case OP_RREF:
1693             case OP_DNRREF:
1694             case OP_DEF:
1695 0           code += PRIV(OP_lengths)[*code];
1696 0           break;
1697              
1698             default:
1699 111           return code;
1700             }
1701 0           }
1702             /* Control never reaches here */
1703             }
1704              
1705              
1706              
1707             /*************************************************
1708             * Find the fixed length of a branch *
1709             *************************************************/
1710              
1711             /* Scan a branch and compute the fixed length of subject that will match it,
1712             if the length is fixed. This is needed for dealing with backward assertions.
1713             In UTF8 mode, the result is in characters rather than bytes. The branch is
1714             temporarily terminated with OP_END when this function is called.
1715              
1716             This function is called when a backward assertion is encountered, so that if it
1717             fails, the error message can point to the correct place in the pattern.
1718             However, we cannot do this when the assertion contains subroutine calls,
1719             because they can be forward references. We solve this by remembering this case
1720             and doing the check at the end; a flag specifies which mode we are running in.
1721              
1722             Arguments:
1723             code points to the start of the pattern (the bracket)
1724             utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1725             atend TRUE if called when the pattern is complete
1726             cd the "compile data" structure
1727             recurses chain of recurse_check to catch mutual recursion
1728              
1729             Returns: the fixed length,
1730             or -1 if there is no fixed length,
1731             or -2 if \C was encountered (in UTF-8 mode only)
1732             or -3 if an OP_RECURSE item was encountered and atend is FALSE
1733             or -4 if an unknown opcode was encountered (internal error)
1734             */
1735              
1736             static int
1737 0           find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1738             recurse_check *recurses)
1739             {
1740 0           int length = -1;
1741             recurse_check this_recurse;
1742 0           register int branchlength = 0;
1743 0           register pcre_uchar *cc = code + 1 + LINK_SIZE;
1744              
1745             /* Scan along the opcodes for this branch. If we get to the end of the
1746             branch, check the length against that of the other branches. */
1747              
1748             for (;;)
1749             {
1750             int d;
1751             pcre_uchar *ce, *cs;
1752 0           register pcre_uchar op = *cc;
1753              
1754 0           switch (op)
1755             {
1756             /* We only need to continue for OP_CBRA (normal capturing bracket) and
1757             OP_BRA (normal non-capturing bracket) because the other variants of these
1758             opcodes are all concerned with unlimited repeated groups, which of course
1759             are not of fixed length. */
1760              
1761             case OP_CBRA:
1762             case OP_BRA:
1763             case OP_ONCE:
1764             case OP_ONCE_NC:
1765             case OP_COND:
1766 0 0         d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1767             recurses);
1768 0 0         if (d < 0) return d;
1769 0           branchlength += d;
1770 0 0         do cc += GET(cc, 1); while (*cc == OP_ALT);
1771 0           cc += 1 + LINK_SIZE;
1772 0           break;
1773              
1774             /* Reached end of a branch; if it's a ket it is the end of a nested call.
1775             If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1776             an ALT. If it is END it's the end of the outer call. All can be handled by
1777             the same code. Note that we must not include the OP_KETRxxx opcodes here,
1778             because they all imply an unlimited repeat. */
1779              
1780             case OP_ALT:
1781             case OP_KET:
1782             case OP_END:
1783             case OP_ACCEPT:
1784             case OP_ASSERT_ACCEPT:
1785 0 0         if (length < 0) length = branchlength;
1786 0 0         else if (length != branchlength) return -1;
1787 0 0         if (*cc != OP_ALT) return length;
1788 0           cc += 1 + LINK_SIZE;
1789 0           branchlength = 0;
1790 0           break;
1791              
1792             /* A true recursion implies not fixed length, but a subroutine call may
1793             be OK. If the subroutine is a forward reference, we can't deal with
1794             it until the end of the pattern, so return -3. */
1795              
1796             case OP_RECURSE:
1797 0 0         if (!atend) return -3;
1798 0           cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1799 0 0         do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1800 0 0         if (cc > cs && cc < ce) return -1; /* Recursion */
    0          
1801             else /* Check for mutual recursion */
1802             {
1803 0           recurse_check *r = recurses;
1804 0 0         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    0          
1805 0 0         if (r != NULL) return -1; /* Mutual recursion */
1806             }
1807 0           this_recurse.prev = recurses;
1808 0           this_recurse.group = cs;
1809 0           d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1810 0 0         if (d < 0) return d;
1811 0           branchlength += d;
1812 0           cc += 1 + LINK_SIZE;
1813 0           break;
1814              
1815             /* Skip over assertive subpatterns */
1816              
1817             case OP_ASSERT:
1818             case OP_ASSERT_NOT:
1819             case OP_ASSERTBACK:
1820             case OP_ASSERTBACK_NOT:
1821 0 0         do cc += GET(cc, 1); while (*cc == OP_ALT);
1822 0           cc += 1 + LINK_SIZE;
1823 0           break;
1824              
1825             /* Skip over things that don't match chars */
1826              
1827             case OP_MARK:
1828             case OP_PRUNE_ARG:
1829             case OP_SKIP_ARG:
1830             case OP_THEN_ARG:
1831 0           cc += cc[1] + PRIV(OP_lengths)[*cc];
1832 0           break;
1833              
1834             case OP_CALLOUT:
1835             case OP_CIRC:
1836             case OP_CIRCM:
1837             case OP_CLOSE:
1838             case OP_COMMIT:
1839             case OP_CREF:
1840             case OP_DEF:
1841             case OP_DNCREF:
1842             case OP_DNRREF:
1843             case OP_DOLL:
1844             case OP_DOLLM:
1845             case OP_EOD:
1846             case OP_EODN:
1847             case OP_FAIL:
1848             case OP_NOT_WORD_BOUNDARY:
1849             case OP_PRUNE:
1850             case OP_REVERSE:
1851             case OP_RREF:
1852             case OP_SET_SOM:
1853             case OP_SKIP:
1854             case OP_SOD:
1855             case OP_SOM:
1856             case OP_THEN:
1857             case OP_WORD_BOUNDARY:
1858 0           cc += PRIV(OP_lengths)[*cc];
1859 0           break;
1860              
1861             /* Handle literal characters */
1862              
1863             case OP_CHAR:
1864             case OP_CHARI:
1865             case OP_NOT:
1866             case OP_NOTI:
1867 0           branchlength++;
1868 0           cc += 2;
1869             #ifdef SUPPORT_UTF
1870             if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1871             #endif
1872 0           break;
1873              
1874             /* Handle exact repetitions. The count is already in characters, but we
1875             need to skip over a multibyte character in UTF8 mode. */
1876              
1877             case OP_EXACT:
1878             case OP_EXACTI:
1879             case OP_NOTEXACT:
1880             case OP_NOTEXACTI:
1881 0           branchlength += (int)GET2(cc,1);
1882 0           cc += 2 + IMM2_SIZE;
1883             #ifdef SUPPORT_UTF
1884             if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1885             #endif
1886 0           break;
1887              
1888             case OP_TYPEEXACT:
1889 0           branchlength += GET2(cc,1);
1890 0 0         if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
    0          
1891 0           cc += 2;
1892 0           cc += 1 + IMM2_SIZE + 1;
1893 0           break;
1894              
1895             /* Handle single-char matchers */
1896              
1897             case OP_PROP:
1898             case OP_NOTPROP:
1899 0           cc += 2;
1900             /* Fall through */
1901              
1902             case OP_HSPACE:
1903             case OP_VSPACE:
1904             case OP_NOT_HSPACE:
1905             case OP_NOT_VSPACE:
1906             case OP_NOT_DIGIT:
1907             case OP_DIGIT:
1908             case OP_NOT_WHITESPACE:
1909             case OP_WHITESPACE:
1910             case OP_NOT_WORDCHAR:
1911             case OP_WORDCHAR:
1912             case OP_ANY:
1913             case OP_ALLANY:
1914 0           branchlength++;
1915 0           cc++;
1916 0           break;
1917              
1918             /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1919             otherwise \C is coded as OP_ALLANY. */
1920              
1921             case OP_ANYBYTE:
1922 0           return -2;
1923              
1924             /* Check a class for variable quantification */
1925              
1926             case OP_CLASS:
1927             case OP_NCLASS:
1928             #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1929             case OP_XCLASS:
1930             /* The original code caused an unsigned overflow in 64 bit systems,
1931             so now we use a conditional statement. */
1932             if (op == OP_XCLASS)
1933             cc += GET(cc, 1);
1934             else
1935             cc += PRIV(OP_lengths)[OP_CLASS];
1936             #else
1937 0           cc += PRIV(OP_lengths)[OP_CLASS];
1938             #endif
1939              
1940 0           switch (*cc)
1941             {
1942             case OP_CRSTAR:
1943             case OP_CRMINSTAR:
1944             case OP_CRPLUS:
1945             case OP_CRMINPLUS:
1946             case OP_CRQUERY:
1947             case OP_CRMINQUERY:
1948             case OP_CRPOSSTAR:
1949             case OP_CRPOSPLUS:
1950             case OP_CRPOSQUERY:
1951 0           return -1;
1952              
1953             case OP_CRRANGE:
1954             case OP_CRMINRANGE:
1955             case OP_CRPOSRANGE:
1956 0 0         if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1957 0           branchlength += (int)GET2(cc,1);
1958 0           cc += 1 + 2 * IMM2_SIZE;
1959 0           break;
1960              
1961             default:
1962 0           branchlength++;
1963             }
1964 0           break;
1965              
1966             /* Anything else is variable length */
1967              
1968             case OP_ANYNL:
1969             case OP_BRAMINZERO:
1970             case OP_BRAPOS:
1971             case OP_BRAPOSZERO:
1972             case OP_BRAZERO:
1973             case OP_CBRAPOS:
1974             case OP_EXTUNI:
1975             case OP_KETRMAX:
1976             case OP_KETRMIN:
1977             case OP_KETRPOS:
1978             case OP_MINPLUS:
1979             case OP_MINPLUSI:
1980             case OP_MINQUERY:
1981             case OP_MINQUERYI:
1982             case OP_MINSTAR:
1983             case OP_MINSTARI:
1984             case OP_MINUPTO:
1985             case OP_MINUPTOI:
1986             case OP_NOTMINPLUS:
1987             case OP_NOTMINPLUSI:
1988             case OP_NOTMINQUERY:
1989             case OP_NOTMINQUERYI:
1990             case OP_NOTMINSTAR:
1991             case OP_NOTMINSTARI:
1992             case OP_NOTMINUPTO:
1993             case OP_NOTMINUPTOI:
1994             case OP_NOTPLUS:
1995             case OP_NOTPLUSI:
1996             case OP_NOTPOSPLUS:
1997             case OP_NOTPOSPLUSI:
1998             case OP_NOTPOSQUERY:
1999             case OP_NOTPOSQUERYI:
2000             case OP_NOTPOSSTAR:
2001             case OP_NOTPOSSTARI:
2002             case OP_NOTPOSUPTO:
2003             case OP_NOTPOSUPTOI:
2004             case OP_NOTQUERY:
2005             case OP_NOTQUERYI:
2006             case OP_NOTSTAR:
2007             case OP_NOTSTARI:
2008             case OP_NOTUPTO:
2009             case OP_NOTUPTOI:
2010             case OP_PLUS:
2011             case OP_PLUSI:
2012             case OP_POSPLUS:
2013             case OP_POSPLUSI:
2014             case OP_POSQUERY:
2015             case OP_POSQUERYI:
2016             case OP_POSSTAR:
2017             case OP_POSSTARI:
2018             case OP_POSUPTO:
2019             case OP_POSUPTOI:
2020             case OP_QUERY:
2021             case OP_QUERYI:
2022             case OP_REF:
2023             case OP_REFI:
2024             case OP_DNREF:
2025             case OP_DNREFI:
2026             case OP_SBRA:
2027             case OP_SBRAPOS:
2028             case OP_SCBRA:
2029             case OP_SCBRAPOS:
2030             case OP_SCOND:
2031             case OP_SKIPZERO:
2032             case OP_STAR:
2033             case OP_STARI:
2034             case OP_TYPEMINPLUS:
2035             case OP_TYPEMINQUERY:
2036             case OP_TYPEMINSTAR:
2037             case OP_TYPEMINUPTO:
2038             case OP_TYPEPLUS:
2039             case OP_TYPEPOSPLUS:
2040             case OP_TYPEPOSQUERY:
2041             case OP_TYPEPOSSTAR:
2042             case OP_TYPEPOSUPTO:
2043             case OP_TYPEQUERY:
2044             case OP_TYPESTAR:
2045             case OP_TYPEUPTO:
2046             case OP_UPTO:
2047             case OP_UPTOI:
2048 0           return -1;
2049              
2050             /* Catch unrecognized opcodes so that when new ones are added they
2051             are not forgotten, as has happened in the past. */
2052              
2053             default:
2054 0           return -4;
2055             }
2056 0           }
2057             /* Control never gets here */
2058             }
2059              
2060              
2061              
2062             /*************************************************
2063             * Scan compiled regex for specific bracket *
2064             *************************************************/
2065              
2066             /* This little function scans through a compiled pattern until it finds a
2067             capturing bracket with the given number, or, if the number is negative, an
2068             instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2069             so that it can be called from pcre_study() when finding the minimum matching
2070             length.
2071              
2072             Arguments:
2073             code points to start of expression
2074             utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2075             number the required bracket number or negative to find a lookbehind
2076              
2077             Returns: pointer to the opcode for the bracket, or NULL if not found
2078             */
2079              
2080             const pcre_uchar *
2081 0           PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2082             {
                 /* Walk the compiled code one item at a time. The loop has no exit
                 condition of its own: it ends only through the return statements below
                 (NULL at OP_END, or a pointer to the item that was asked for). */
2083             for (;;)
2084             {
2085 0           register pcre_uchar c = *code;
2086              
2087 0 0         if (c == OP_END) return NULL;
2088              
2089             /* XCLASS is used for classes that cannot be represented just by a bit
2090             map. This includes negated single high-valued characters. The length in
2091             the table is zero; the actual length is stored in the compiled code. */
2092              
2093 0 0         if (c == OP_XCLASS) code += GET(code, 1);
2094              
2095             /* Handle lookbehind: OP_REVERSE is the item returned when number is negative; otherwise it is stepped over using its table length. */
2096              
2097 0 0         else if (c == OP_REVERSE)
2098             {
2099 0 0         if (number < 0) return (pcre_uchar *)code;
2100 0           code += PRIV(OP_lengths)[c];
2101             }
2102              
2103             /* Handle capturing bracket */
2104              
2105 0 0         else if (c == OP_CBRA || c == OP_SCBRA ||
    0          
    0          
2106 0 0         c == OP_CBRAPOS || c == OP_SCBRAPOS)
2107 0           {
                 /* The bracket number is read with GET2 at offset 1+LINK_SIZE from the
                 opcode and compared with the requested number. The cast on the return
                 drops the const qualifier, although the declared return type is
                 const-qualified. */
2108 0           int n = (int)GET2(code, 1+LINK_SIZE);
2109 0 0         if (n == number) return (pcre_uchar *)code;
2110 0           code += PRIV(OP_lengths)[c];
2111             }
2112              
2113             /* Otherwise, we can get the item's length from the table, except that for
2114             repeated character types, we have to test for \p and \P, which have an extra
2115             two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2116             must add in its length. */
2117              
2118             else
2119             {
2120 0           switch(c)
2121             {
                 /* There is no default case: an opcode that is not listed here gets no
                 extra adjustment and is advanced by its table length alone. */
2122             case OP_TYPESTAR:
2123             case OP_TYPEMINSTAR:
2124             case OP_TYPEPLUS:
2125             case OP_TYPEMINPLUS:
2126             case OP_TYPEQUERY:
2127             case OP_TYPEMINQUERY:
2128             case OP_TYPEPOSSTAR:
2129             case OP_TYPEPOSPLUS:
2130             case OP_TYPEPOSQUERY:
2131 0 0         if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
    0          
2132 0           break;
2133              
2134             case OP_TYPEUPTO:
2135             case OP_TYPEMINUPTO:
2136             case OP_TYPEEXACT:
2137             case OP_TYPEPOSUPTO:
                 /* Same test as above, but the type code sits after an IMM2_SIZE-wide
                 field that follows the opcode. */
2138 0 0         if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
    0          
2139 0           code += 2;
2140 0           break;
2141              
2142             case OP_MARK:
2143             case OP_PRUNE_ARG:
2144             case OP_SKIP_ARG:
2145             case OP_THEN_ARG:
                 /* code[1] is added as the length of the argument. */
2146 0           code += code[1];
2147 0           break;
2148             }
2149              
2150             /* Add in the fixed length from the table */
2151              
2152 0           code += PRIV(OP_lengths)[c];
2153              
2154             /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2155             a multi-byte character. The length in the table is a minimum, so we have to
2156             arrange to skip the extra bytes. */
2157              
2158             #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2159             if (utf) switch(c)
2160             {
2161             case OP_CHAR:
2162             case OP_CHARI:
2163             case OP_NOT:
2164             case OP_NOTI:
2165             case OP_EXACT:
2166             case OP_EXACTI:
2167             case OP_NOTEXACT:
2168             case OP_NOTEXACTI:
2169             case OP_UPTO:
2170             case OP_UPTOI:
2171             case OP_NOTUPTO:
2172             case OP_NOTUPTOI:
2173             case OP_MINUPTO:
2174             case OP_MINUPTOI:
2175             case OP_NOTMINUPTO:
2176             case OP_NOTMINUPTOI:
2177             case OP_POSUPTO:
2178             case OP_POSUPTOI:
2179             case OP_NOTPOSUPTO:
2180             case OP_NOTPOSUPTOI:
2181             case OP_STAR:
2182             case OP_STARI:
2183             case OP_NOTSTAR:
2184             case OP_NOTSTARI:
2185             case OP_MINSTAR:
2186             case OP_MINSTARI:
2187             case OP_NOTMINSTAR:
2188             case OP_NOTMINSTARI:
2189             case OP_POSSTAR:
2190             case OP_POSSTARI:
2191             case OP_NOTPOSSTAR:
2192             case OP_NOTPOSSTARI:
2193             case OP_PLUS:
2194             case OP_PLUSI:
2195             case OP_NOTPLUS:
2196             case OP_NOTPLUSI:
2197             case OP_MINPLUS:
2198             case OP_MINPLUSI:
2199             case OP_NOTMINPLUS:
2200             case OP_NOTMINPLUSI:
2201             case OP_POSPLUS:
2202             case OP_POSPLUSI:
2203             case OP_NOTPOSPLUS:
2204             case OP_NOTPOSPLUSI:
2205             case OP_QUERY:
2206             case OP_QUERYI:
2207             case OP_NOTQUERY:
2208             case OP_NOTQUERYI:
2209             case OP_MINQUERY:
2210             case OP_MINQUERYI:
2211             case OP_NOTMINQUERY:
2212             case OP_NOTMINQUERYI:
2213             case OP_POSQUERY:
2214             case OP_POSQUERYI:
2215             case OP_NOTPOSQUERY:
2216             case OP_NOTPOSQUERYI:
                 /* code was already advanced by the table length, so code[-1] is the
                 last code unit that length accounts for. NOTE(review): HAS_EXTRALEN and
                 GET_EXTRALEN are defined elsewhere; presumably they give the number of
                 further code units of a multi-unit character - confirm there. */
2217             if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2218             break;
2219             }
2220             #else
2221             (void)(utf); /* Keep compiler happy by referencing function argument */
2222             #endif
2223             }
2224 0           }
2225             }
2226              
2227              
2228              
2229             /*************************************************
2230             * Scan compiled regex for recursion reference *
2231             *************************************************/
2232              
2233             /* This little function scans through a compiled pattern until it finds an
2234             instance of OP_RECURSE.
2235              
2236             Arguments:
2237             code points to start of expression
2238             utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2239              
2240             Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2241             */
2242              
2243             static const pcre_uchar *
2244 6           find_recurse(const pcre_uchar *code, BOOL utf)
2245             {
                 /* Walk the compiled code one item at a time. The loop ends only through
                 the two return statements: NULL when OP_END is met, or the current
                 position when the item is OP_RECURSE. */
2246             for (;;)
2247             {
2248 30           register pcre_uchar c = *code;
2249 30 100         if (c == OP_END) return NULL;
2250 24 50         if (c == OP_RECURSE) return code;
2251              
2252             /* XCLASS is used for classes that cannot be represented just by a bit
2253             map. This includes negated single high-valued characters. The length in
2254             the table is zero; the actual length is stored in the compiled code. */
2255              
2256 24 50         if (c == OP_XCLASS) code += GET(code, 1);
2257              
2258             /* Otherwise, we can get the item's length from the table, except that for
2259             repeated character types, we have to test for \p and \P, which have an extra
2260             two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2261             must add in its length. */
2262              
2263             else
2264             {
2265 24           switch(c)
2266             {
                 /* There is no default case: an opcode that is not listed here gets no
                 extra adjustment and is advanced by its table length alone. */
2267             case OP_TYPESTAR:
2268             case OP_TYPEMINSTAR:
2269             case OP_TYPEPLUS:
2270             case OP_TYPEMINPLUS:
2271             case OP_TYPEQUERY:
2272             case OP_TYPEMINQUERY:
2273             case OP_TYPEPOSSTAR:
2274             case OP_TYPEPOSPLUS:
2275             case OP_TYPEPOSQUERY:
2276 0 0         if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
    0          
2277 0           break;
2278              
2279             case OP_TYPEPOSUPTO:
2280             case OP_TYPEUPTO:
2281             case OP_TYPEMINUPTO:
2282             case OP_TYPEEXACT:
                 /* Same test as above, but the type code sits after an IMM2_SIZE-wide
                 field that follows the opcode. */
2283 0 0         if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
    0          
2284 0           code += 2;
2285 0           break;
2286              
2287             case OP_MARK:
2288             case OP_PRUNE_ARG:
2289             case OP_SKIP_ARG:
2290             case OP_THEN_ARG:
                 /* code[1] is added as the length of the argument. */
2291 0           code += code[1];
2292 0           break;
2293             }
2294              
2295             /* Add in the fixed length from the table */
2296              
2297 24           code += PRIV(OP_lengths)[c];
2298              
2299             /* In UTF-8 mode, opcodes that are followed by a character may be followed
2300             by a multi-byte character. The length in the table is a minimum, so we have
2301             to arrange to skip the extra bytes. */
2302              
2303             #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2304             if (utf) switch(c)
2305             {
2306             case OP_CHAR:
2307             case OP_CHARI:
2308             case OP_NOT:
2309             case OP_NOTI:
2310             case OP_EXACT:
2311             case OP_EXACTI:
2312             case OP_NOTEXACT:
2313             case OP_NOTEXACTI:
2314             case OP_UPTO:
2315             case OP_UPTOI:
2316             case OP_NOTUPTO:
2317             case OP_NOTUPTOI:
2318             case OP_MINUPTO:
2319             case OP_MINUPTOI:
2320             case OP_NOTMINUPTO:
2321             case OP_NOTMINUPTOI:
2322             case OP_POSUPTO:
2323             case OP_POSUPTOI:
2324             case OP_NOTPOSUPTO:
2325             case OP_NOTPOSUPTOI:
2326             case OP_STAR:
2327             case OP_STARI:
2328             case OP_NOTSTAR:
2329             case OP_NOTSTARI:
2330             case OP_MINSTAR:
2331             case OP_MINSTARI:
2332             case OP_NOTMINSTAR:
2333             case OP_NOTMINSTARI:
2334             case OP_POSSTAR:
2335             case OP_POSSTARI:
2336             case OP_NOTPOSSTAR:
2337             case OP_NOTPOSSTARI:
2338             case OP_PLUS:
2339             case OP_PLUSI:
2340             case OP_NOTPLUS:
2341             case OP_NOTPLUSI:
2342             case OP_MINPLUS:
2343             case OP_MINPLUSI:
2344             case OP_NOTMINPLUS:
2345             case OP_NOTMINPLUSI:
2346             case OP_POSPLUS:
2347             case OP_POSPLUSI:
2348             case OP_NOTPOSPLUS:
2349             case OP_NOTPOSPLUSI:
2350             case OP_QUERY:
2351             case OP_QUERYI:
2352             case OP_NOTQUERY:
2353             case OP_NOTQUERYI:
2354             case OP_MINQUERY:
2355             case OP_MINQUERYI:
2356             case OP_NOTMINQUERY:
2357             case OP_NOTMINQUERYI:
2358             case OP_POSQUERY:
2359             case OP_POSQUERYI:
2360             case OP_NOTPOSQUERY:
2361             case OP_NOTPOSQUERYI:
                 /* code was already advanced by the table length, so code[-1] is the
                 last code unit that length accounts for. NOTE(review): HAS_EXTRALEN and
                 GET_EXTRALEN are defined elsewhere; presumably they give the number of
                 further code units of a multi-unit character - confirm there. */
2362             if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2363             break;
2364             }
2365             #else
2366             (void)(utf); /* Keep compiler happy by referencing function argument */
2367             #endif
2368             }
2369 24           }
2370             }
2371              
2372              
2373              
2374             /*************************************************
2375             * Scan compiled branch for non-emptiness *
2376             *************************************************/
2377              
2378             /* This function scans through a branch of a compiled pattern to see whether it
2379             can match the empty string or not. It is called from could_be_empty()
2380             below and from compile_branch() when checking for an unlimited repeat of a
2381             group that can match nothing. Note that first_significant_code() skips over
2382             backward and negative forward assertions when its final argument is TRUE. If we
2383             hit an unclosed bracket, we return "empty" - this means we've struck an inner
2384             bracket whose current branch will already have been scanned.
2385              
2386             Arguments:
2387             code points to start of search
2388             endcode points to where to stop
2389             utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2390             cd contains pointers to tables etc.
2391             recurses chain of recurse_check to catch mutual recursion
2392              
2393             Returns: TRUE if what is matched could be empty
2394             */
2395              
2396             static BOOL
2397 46           could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2398             BOOL utf, compile_data *cd, recurse_check *recurses)
2399             {
2400             register pcre_uchar c;
2401             recurse_check this_recurse;
2402              
                 /* The scan starts just after the item at code and moves on item by item
                 until endcode. The step expression uses c rather than *code, so every path
                 below that moves code and then continues refreshes c first; a path that
                 continues without touching code relies on c still holding the opcode of
                 the current item so that the step skips exactly that item. */
2403 59 50         for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2404             code < endcode;
2405 13           code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2406             {
2407             const pcre_uchar *ccode;
2408              
2409 59           c = *code;
2410              
2411             /* Skip over forward assertions; the other assertions are skipped by
2412             first_significant_code() with a TRUE final argument. */
2413              
2414 59 50         if (c == OP_ASSERT)
2415             {
                 /* Follow the link at offset 1 repeatedly while it lands on OP_ALT, so
                 that code ends up on the first item after the last alternative. */
2416 0 0         do code += GET(code, 1); while (*code == OP_ALT);
2417 0           c = *code;
2418 0           continue;
2419             }
2420              
2421             /* For a recursion/subroutine call, if its end has been reached, which
2422             implies a backward reference subroutine call, we can scan it. If it's a
2423             forward reference subroutine call, we can't. To detect forward reference
2424             we have to scan up the list that is kept in the workspace. This function is
2425             called only when doing the real compile, not during the pre-compile that
2426             measures the size of the compiled pattern. */
2427              
2428 59 50         if (c == OP_RECURSE)
2429             {
                 /* scode is the start of the called group, found by adding the offset
                 stored in the RECURSE item to cd->start_code. endgroup is moved to the end
                 of that group further down. */
2430 0           const pcre_uchar *scode = cd->start_code + GET(code, 1);
2431 0           const pcre_uchar *endgroup = scode;
2432             BOOL empty_branch;
2433              
2434             /* Test for forward reference or uncompleted reference. This is disabled
2435             when called to scan a completed pattern by setting cd->start_workspace to
2436             NULL. */
2437              
2438 0 0         if (cd->start_workspace != NULL)
2439             {
2440             const pcre_uchar *tcode;
                 /* Each LINK_SIZE-wide workspace entry up to cd->hwm is compared with
                 the offset of code + 1 from cd->start_code; a match is answered with
                 TRUE, as is a group whose link field is still zero. */
2441 0 0         for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2442 0 0         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2443 0 0         if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2444             }
2445              
2446             /* If the reference is to a completed group, we need to detect whether this
2447             is a recursive call, as otherwise there will be an infinite loop. If it is
2448             a recursion, just skip over it. Simple recursions are easily detected. For
2449             mutual recursions we keep a chain on the stack. */
2450              
                 /* The continue statements in this block leave code and c untouched, so
                 the loop step moves past the RECURSE item itself. */
2451 0 0         do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2452 0 0         if (code >= scode && code <= endgroup) continue; /* Simple recursion */
    0          
2453             else
2454             {
                 /* The initializer of r is repeated by the for statement below. */
2455 0           recurse_check *r = recurses;
2456 0 0         for (r = recurses; r != NULL; r = r->prev)
2457 0 0         if (r->group == scode) break;
2458 0 0         if (r != NULL) continue; /* Mutual recursion */
2459             }
2460              
2461             /* Completed reference; scan the referenced group, remembering it on the
2462             stack chain to detect mutual recursions. */
2463              
2464 0           empty_branch = FALSE;
2465 0           this_recurse.prev = recurses;
2466 0           this_recurse.group = scode;
2467              
                 /* Each alternative of the called group is scanned in turn, with the
                 same endcode as this call; the first one that could be empty ends the
                 search. */
2468             do
2469             {
2470 0 0         if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2471             {
2472 0           empty_branch = TRUE;
2473 0           break;
2474             }
2475 0           scode += GET(scode, 1);
2476             }
2477 0 0         while (*scode == OP_ALT);
2478              
2479 0 0         if (!empty_branch) return FALSE; /* All branches are non-empty */
2480 0           continue;
2481             }
2482              
2483             /* Groups with zero repeats can of course be empty; skip them. */
2484              
2485 59 50         if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
    50          
    50          
    50          
2486             c == OP_BRAPOSZERO)
2487             {
                 /* Step over the zero-repeat opcode first, then follow the links of the
                 item after it past all of its alternatives. */
2488 0           code += PRIV(OP_lengths)[c];
2489 0 0         do code += GET(code, 1); while (*code == OP_ALT);
2490 0           c = *code;
2491 0           continue;
2492             }
2493              
2494             /* A nested group that is already marked as "could be empty" can just be
2495             skipped. */
2496              
2497 59 50         if (c == OP_SBRA || c == OP_SBRAPOS ||
    50          
    50          
2498 59 50         c == OP_SCBRA || c == OP_SCBRAPOS)
2499             {
2500 0 0         do code += GET(code, 1); while (*code == OP_ALT);
2501 0           c = *code;
2502 0           continue;
2503             }
2504              
2505             /* For other groups, scan the branches. */
2506              
2507 59 50         if (c == OP_BRA || c == OP_BRAPOS ||
    50          
    50          
2508 59 50         c == OP_CBRA || c == OP_CBRAPOS ||
    50          
2509 59 50         c == OP_ONCE || c == OP_ONCE_NC ||
    50          
2510 59 50         c == OP_COND || c == OP_SCOND)
2511             {
2512             BOOL empty_branch;
2513 0 0         if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2514              
2515             /* If a conditional group has only one branch, there is a second, implied,
2516             empty branch, so just skip over the conditional, because it could be empty.
2517             Otherwise, scan the individual branches of the group. */
2518              
2519 0 0         if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
    0          
2520 0           code += GET(code, 1);
2521             else
2522             {
2523 0           empty_branch = FALSE;
                 /* Every alternative is walked so that code ends up past the whole
                 group, but the recursive scan is no longer called once one alternative
                 has been found that could be empty. */
2524             do
2525             {
2526 0 0         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
    0          
2527 0           recurses)) empty_branch = TRUE;
2528 0           code += GET(code, 1);
2529             }
2530 0 0         while (*code == OP_ALT);
2531 0 0         if (!empty_branch) return FALSE; /* All branches are non-empty */
2532             }
2533              
2534 0           c = *code;
2535 0           continue;
2536             }
2537              
2538             /* Handle the other opcodes */
2539              
2540 59           switch (c)
2541             {
2542             /* Check for quantifiers after a class. XCLASS is used for classes that
2543             cannot be represented just by a bit map. This includes negated single
2544             high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2545             actual length is stored in the compiled code, so we must update "code"
2546             here. */
2547              
2548             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2549             case OP_XCLASS:
                 /* Both code and ccode are moved past the whole XCLASS item, so ccode
                 points at whatever follows the class. */
2550             ccode = code += GET(code, 1);
2551             goto CHECK_CLASS_REPEAT;
2552             #endif
2553              
2554             case OP_CLASS:
2555             case OP_NCLASS:
                 /* Only ccode is moved here; code itself is advanced later by the loop
                 step, using the table length for c. */
2556 0           ccode = code + PRIV(OP_lengths)[OP_CLASS];
2557              
2558             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2559             CHECK_CLASS_REPEAT:
2560             #endif
2561              
                 /* In this inner switch the default label shares its return with the
                 PLUS repeats, so anything that is not a listed repeat opcode answers
                 FALSE. */
2562 0           switch (*ccode)
2563             {
2564             case OP_CRSTAR: /* These could be empty; continue */
2565             case OP_CRMINSTAR:
2566             case OP_CRQUERY:
2567             case OP_CRMINQUERY:
2568             case OP_CRPOSSTAR:
2569             case OP_CRPOSQUERY:
2570 0           break;
2571              
2572             default: /* Non-repeat => class must match */
2573             case OP_CRPLUS: /* These repeats aren't empty */
2574             case OP_CRMINPLUS:
2575             case OP_CRPOSPLUS:
2576 0           return FALSE;
2577              
2578             case OP_CRRANGE:
2579             case OP_CRMINRANGE:
2580             case OP_CRPOSRANGE:
2581 0 0         if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2582 0           break;
2583             }
2584 0           break;
2585              
2586             /* Opcodes that must match a character */
2587              
2588             case OP_ANY:
2589             case OP_ALLANY:
2590             case OP_ANYBYTE:
2591              
2592             case OP_PROP:
2593             case OP_NOTPROP:
2594             case OP_ANYNL:
2595              
2596             case OP_NOT_HSPACE:
2597             case OP_HSPACE:
2598             case OP_NOT_VSPACE:
2599             case OP_VSPACE:
2600             case OP_EXTUNI:
2601              
2602             case OP_NOT_DIGIT:
2603             case OP_DIGIT:
2604             case OP_NOT_WHITESPACE:
2605             case OP_WHITESPACE:
2606             case OP_NOT_WORDCHAR:
2607             case OP_WORDCHAR:
2608              
2609             case OP_CHAR:
2610             case OP_CHARI:
2611             case OP_NOT:
2612             case OP_NOTI:
2613              
2614             case OP_PLUS:
2615             case OP_PLUSI:
2616             case OP_MINPLUS:
2617             case OP_MINPLUSI:
2618              
2619             case OP_NOTPLUS:
2620             case OP_NOTPLUSI:
2621             case OP_NOTMINPLUS:
2622             case OP_NOTMINPLUSI:
2623              
2624             case OP_POSPLUS:
2625             case OP_POSPLUSI:
2626             case OP_NOTPOSPLUS:
2627             case OP_NOTPOSPLUSI:
2628              
2629             case OP_EXACT:
2630             case OP_EXACTI:
2631             case OP_NOTEXACT:
2632             case OP_NOTEXACTI:
2633              
2634             case OP_TYPEPLUS:
2635             case OP_TYPEMINPLUS:
2636             case OP_TYPEPOSPLUS:
2637             case OP_TYPEEXACT:
2638              
2639 41           return FALSE;
2640              
2641             /* These are going to continue, as they may be empty, but we have to
2642             fudge the length for the \p and \P cases. */
2643              
2644             case OP_TYPESTAR:
2645             case OP_TYPEMINSTAR:
2646             case OP_TYPEPOSSTAR:
2647             case OP_TYPEQUERY:
2648             case OP_TYPEMINQUERY:
2649             case OP_TYPEPOSQUERY:
2650 0 0         if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
    0          
2651 0           break;
2652              
2653             /* Same for these */
2654              
2655             case OP_TYPEUPTO:
2656             case OP_TYPEMINUPTO:
2657             case OP_TYPEPOSUPTO:
2658 0 0         if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
    0          
2659 0           code += 2;
2660 0           break;
2661              
2662             /* End of branch */
2663              
2664             case OP_KET:
2665             case OP_KETRMAX:
2666             case OP_KETRMIN:
2667             case OP_KETRPOS:
2668             case OP_ALT:
2669 5           return TRUE;
2670              
2671             /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2672             MINUPTO, and POSUPTO and their caseless and negative versions may be
2673             followed by a multibyte character. */
2674              
2675             #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2676             case OP_STAR:
2677             case OP_STARI:
2678             case OP_NOTSTAR:
2679             case OP_NOTSTARI:
2680              
2681             case OP_MINSTAR:
2682             case OP_MINSTARI:
2683             case OP_NOTMINSTAR:
2684             case OP_NOTMINSTARI:
2685              
2686             case OP_POSSTAR:
2687             case OP_POSSTARI:
2688             case OP_NOTPOSSTAR:
2689             case OP_NOTPOSSTARI:
2690              
2691             case OP_QUERY:
2692             case OP_QUERYI:
2693             case OP_NOTQUERY:
2694             case OP_NOTQUERYI:
2695              
2696             case OP_MINQUERY:
2697             case OP_MINQUERYI:
2698             case OP_NOTMINQUERY:
2699             case OP_NOTMINQUERYI:
2700              
2701             case OP_POSQUERY:
2702             case OP_POSQUERYI:
2703             case OP_NOTPOSQUERY:
2704             case OP_NOTPOSQUERYI:
2705              
                 /* code has not been advanced yet at this point, so the code unit that
                 is tested is the one directly after the opcode. Only the extra units are
                 added here; the table length is added by the loop step. */
2706             if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2707             break;
2708              
2709             case OP_UPTO:
2710             case OP_UPTOI:
2711             case OP_NOTUPTO:
2712             case OP_NOTUPTOI:
2713              
2714             case OP_MINUPTO:
2715             case OP_MINUPTOI:
2716             case OP_NOTMINUPTO:
2717             case OP_NOTMINUPTOI:
2718              
2719             case OP_POSUPTO:
2720             case OP_POSUPTOI:
2721             case OP_NOTPOSUPTO:
2722             case OP_NOTPOSUPTOI:
2723              
                 /* As above, except that the code unit tested lies after an
                 IMM2_SIZE-wide field that follows the opcode. */
2724             if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2725             break;
2726             #endif
2727              
2728             /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2729             string. */
2730              
2731             case OP_MARK:
2732             case OP_PRUNE_ARG:
2733             case OP_SKIP_ARG:
2734             case OP_THEN_ARG:
2735 0           code += code[1];
2736 0           break;
2737              
2738             /* None of the remaining opcodes are required to match a character. */
2739              
2740             default:
2741 13           break;
2742             }
2743             }
2744              
                 /* Reached when code has run up to endcode without any of the return
                 statements inside the loop being taken. */
2745 46           return TRUE;
2746             }
2747              
2748              
2749              
2750             /*************************************************
2751             * Scan compiled regex for non-emptiness *
2752             *************************************************/
2753              
2754             /* This function is called to check for left recursive calls. We want to check
2755             the current branch of the current pattern to see if it could match the empty
2756             string. If it could, we must look outwards for branches at other levels,
2757             stopping when we pass beyond the bracket which is the subject of the recursion.
2758             This function is called only during the real compile, not during the
2759             pre-compile.
2760              
2761             Arguments:
2762             code points to start of the recursion
2763             endcode points to where to stop (current RECURSE item)
2764             bcptr points to the chain of current (unclosed) branch starts
2765             utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2766             cd pointers to tables etc
2767              
2768             Returns: TRUE if what is matched could be empty
2769             */
2770              
2771             static BOOL
2772 0           could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2773             branch_chain *bcptr, BOOL utf, compile_data *cd)
2774             {
2775 0 0         while (bcptr != NULL && bcptr->current_branch >= code)
    0          
2776             {
2777 0 0         if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2778 0           return FALSE;
2779 0           bcptr = bcptr->outer;
2780             }
2781 0           return TRUE;
2782             }
2783              
2784              
2785              
2786             /*************************************************
2787             * Base opcode of repeated opcodes *
2788             *************************************************/
2789              
2790             /* Returns the base opcode for repeated single character type opcodes. If the
2791             opcode is not a repeated character type, it returns with the original value.
2792              
2793             Arguments: c opcode
2794             Returns: base opcode for the type
2795             */
2796              
2797             static pcre_uchar
2798 82           get_repeat_base(pcre_uchar c)
2799             {
2800 82 50         return (c > OP_TYPEPOSUPTO)? c :
    50          
    0          
    0          
    0          
2801             (c >= OP_TYPESTAR)? OP_TYPESTAR :
2802             (c >= OP_NOTSTARI)? OP_NOTSTARI :
2803             (c >= OP_NOTSTAR)? OP_NOTSTAR :
2804             (c >= OP_STARI)? OP_STARI :
2805             OP_STAR;
2806             }
2807              
2808              
2809              
2810             #ifdef SUPPORT_UCP
2811             /*************************************************
2812             * Check a character and a property *
2813             *************************************************/
2814              
2815             /* This function is called by check_auto_possessive() when a property item
2816             is adjacent to a fixed character.
2817              
2818             Arguments:
2819             c the character
2820             ptype the property type
2821             pdata the data for the type
2822             negated TRUE if it's a negated property (\P or \p{^)
2823              
2824             Returns: TRUE if auto-possessifying is OK
2825             */
2826              
2827             static BOOL
2828             check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2829             BOOL negated)
2830             {
2831             const pcre_uint32 *p;
2832             const ucd_record *prop = GET_UCD(c);
2833              
2834             switch(ptype)
2835             {
2836             case PT_LAMP:
2837             return (prop->chartype == ucp_Lu ||
2838             prop->chartype == ucp_Ll ||
2839             prop->chartype == ucp_Lt) == negated;
2840              
2841             case PT_GC:
2842             return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2843              
2844             case PT_PC:
2845             return (pdata == prop->chartype) == negated;
2846              
2847             case PT_SC:
2848             return (pdata == prop->script) == negated;
2849              
2850             /* These are specials */
2851              
2852             case PT_ALNUM:
2853             return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2854             PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2855              
2856             /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2857             means that Perl space and POSIX space are now identical. PCRE was changed
2858             at release 8.34. */
2859              
2860             case PT_SPACE: /* Perl space */
2861             case PT_PXSPACE: /* POSIX space */
2862             switch(c)
2863             {
2864             HSPACE_CASES:
2865             VSPACE_CASES:
2866             return negated;
2867              
2868             default:
2869             return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2870             }
2871             break; /* Control never reaches here */
2872              
2873             case PT_WORD:
2874             return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2875             PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2876             c == CHAR_UNDERSCORE) == negated;
2877              
2878             case PT_CLIST:
2879             p = PRIV(ucd_caseless_sets) + prop->caseset;
2880             for (;;)
2881             {
2882             if (c < *p) return !negated;
2883             if (c == *p++) return negated;
2884             }
2885             break; /* Control never reaches here */
2886             }
2887              
2888             return FALSE;
2889             }
2890             #endif /* SUPPORT_UCP */
2891              
2892              
2893              
2894             /*************************************************
2895             * Fill the character property list *
2896             *************************************************/
2897              
2898             /* Checks whether the code points to an opcode that can take part in auto-
2899             possessification, and if so, fills a list with its properties.
2900              
2901             Arguments:
2902             code points to start of expression
2903             utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2904             fcc points to case-flipping table
2905             list points to output list
2906             list[0] will be filled with the opcode
2907             list[1] will be non-zero if this opcode
2908             can match an empty character string
2909             list[2..7] depends on the opcode
2910              
2911             Returns: points to the start of the next opcode if *code is accepted
2912             NULL if *code is not accepted
2913             */
2914              
2915             static const pcre_uchar *
2916 79           get_chr_property_list(const pcre_uchar *code, BOOL utf,
2917             const pcre_uint8 *fcc, pcre_uint32 *list)
2918             {
2919 79           pcre_uchar c = *code;
2920             pcre_uchar base;
2921             const pcre_uchar *end;
2922             pcre_uint32 chr;
2923              
2924             #ifdef SUPPORT_UCP
2925             pcre_uint32 *clist_dest;
2926             const pcre_uint32 *clist_src;
2927             #else
2928             ((void)utf); /* Suppress "unused parameter" compiler warning */
2929             #endif
2930              
2931 79           list[0] = c;
2932 79           list[1] = FALSE;
2933 79           code++;
2934              
2935 79 100         if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
    50          
2936             {
2937 41           base = get_repeat_base(c);
2938 41           c -= (base - OP_STAR);
2939              
2940 41 50         if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
    50          
    50          
    50          
2941 0           code += IMM2_SIZE;
2942              
2943 41 100         list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
    50          
    50          
    50          
2944              
2945 41           switch(base)
2946             {
2947             case OP_STAR:
2948 0           list[0] = OP_CHAR;
2949 0           break;
2950              
2951             case OP_STARI:
2952 0           list[0] = OP_CHARI;
2953 0           break;
2954              
2955             case OP_NOTSTAR:
2956 0           list[0] = OP_NOT;
2957 0           break;
2958              
2959             case OP_NOTSTARI:
2960 0           list[0] = OP_NOTI;
2961 0           break;
2962              
2963             case OP_TYPESTAR:
2964 41           list[0] = *code;
2965 41           code++;
2966 41           break;
2967             }
2968 41           c = list[0];
2969             }
2970              
2971 79           switch(c)
2972             {
2973             case OP_NOT_DIGIT:
2974             case OP_DIGIT:
2975             case OP_NOT_WHITESPACE:
2976             case OP_WHITESPACE:
2977             case OP_NOT_WORDCHAR:
2978             case OP_WORDCHAR:
2979             case OP_ANY:
2980             case OP_ALLANY:
2981             case OP_ANYNL:
2982             case OP_NOT_HSPACE:
2983             case OP_HSPACE:
2984             case OP_NOT_VSPACE:
2985             case OP_VSPACE:
2986             case OP_EXTUNI:
2987             case OP_EODN:
2988             case OP_EOD:
2989             case OP_DOLL:
2990             case OP_DOLLM:
2991 41           return code;
2992              
2993             case OP_CHAR:
2994             case OP_NOT:
2995 38           GETCHARINCTEST(chr, code);
2996 38           list[2] = chr;
2997 38           list[3] = NOTACHAR;
2998 38           return code;
2999              
3000             case OP_CHARI:
3001             case OP_NOTI:
3002 0 0         list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3003 0           GETCHARINCTEST(chr, code);
3004 0           list[2] = chr;
3005              
3006             #ifdef SUPPORT_UCP
3007             if (chr < 128 || (chr < 256 && !utf))
3008             list[3] = fcc[chr];
3009             else
3010             list[3] = UCD_OTHERCASE(chr);
3011             #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3012             list[3] = (chr < 256) ? fcc[chr] : chr;
3013             #else
3014 0           list[3] = fcc[chr];
3015             #endif
3016              
3017             /* The othercase might be the same value. */
3018              
3019 0 0         if (chr == list[3])
3020 0           list[3] = NOTACHAR;
3021             else
3022 0           list[4] = NOTACHAR;
3023 0           return code;
3024              
3025             #ifdef SUPPORT_UCP
3026             case OP_PROP:
3027             case OP_NOTPROP:
3028             if (code[0] != PT_CLIST)
3029             {
3030             list[2] = code[0];
3031             list[3] = code[1];
3032             return code + 2;
3033             }
3034              
3035             /* Convert only if we have enough space. */
3036              
3037             clist_src = PRIV(ucd_caseless_sets) + code[1];
3038             clist_dest = list + 2;
3039             code += 2;
3040              
3041             do {
3042             if (clist_dest >= list + 8)
3043             {
3044             /* Early return if there is not enough space. This should never
3045             happen, since all clists are shorter than 5 character now. */
3046             list[2] = code[0];
3047             list[3] = code[1];
3048             return code;
3049             }
3050             *clist_dest++ = *clist_src;
3051             }
3052             while(*clist_src++ != NOTACHAR);
3053              
3054             /* All characters are stored. The terminating NOTACHAR
3055             is copied form the clist itself. */
3056              
3057             list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3058             return code;
3059             #endif
3060              
3061             case OP_NCLASS:
3062             case OP_CLASS:
3063             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3064             case OP_XCLASS:
3065             if (c == OP_XCLASS)
3066             end = code + GET(code, 0) - 1;
3067             else
3068             #endif
3069 0           end = code + 32 / sizeof(pcre_uchar);
3070              
3071 0           switch(*end)
3072             {
3073             case OP_CRSTAR:
3074             case OP_CRMINSTAR:
3075             case OP_CRQUERY:
3076             case OP_CRMINQUERY:
3077             case OP_CRPOSSTAR:
3078             case OP_CRPOSQUERY:
3079 0           list[1] = TRUE;
3080 0           end++;
3081 0           break;
3082              
3083             case OP_CRPLUS:
3084             case OP_CRMINPLUS:
3085             case OP_CRPOSPLUS:
3086 0           end++;
3087 0           break;
3088              
3089             case OP_CRRANGE:
3090             case OP_CRMINRANGE:
3091             case OP_CRPOSRANGE:
3092 0           list[1] = (GET2(end, 1) == 0);
3093 0           end += 1 + 2 * IMM2_SIZE;
3094 0           break;
3095             }
3096 0           list[2] = (pcre_uint32)(end - code);
3097 0           return end;
3098             }
3099 0           return NULL; /* Opcode not accepted */
3100             }
3101              
3102              
3103              
3104             /*************************************************
3105             * Scan further character sets for match *
3106             *************************************************/
3107              
3108             /* Checks whether the base and the current opcode have a common character, in
3109             which case the base cannot be possessified.
3110              
3111             Arguments:
3112             code points to the byte code
3113             utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3114             cd static compile data
3115             base_list the data list of the base opcode
3116              
3117             Returns: TRUE if the auto-possessification is possible
3118             */
3119              
3120             static BOOL
3121 41           compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3122             const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3123             {
3124             pcre_uchar c;
3125             pcre_uint32 list[8];
3126             const pcre_uint32 *chr_ptr;
3127             const pcre_uint32 *ochr_ptr;
3128             const pcre_uint32 *list_ptr;
3129             const pcre_uchar *next_code;
3130             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3131             const pcre_uchar *xclass_flags;
3132             #endif
3133             const pcre_uint8 *class_bitset;
3134             const pcre_uint8 *set1, *set2, *set_end;
3135             pcre_uint32 chr;
3136             BOOL accepted, invert_bits;
3137 41           BOOL entered_a_group = FALSE;
3138              
3139 41 50         if (*rec_limit == 0) return FALSE;
3140 41           --(*rec_limit);
3141              
3142             /* Note: base_list[1] records whether the current opcode has a greedy
3143             quantifier (represented by a non-zero value). This is different from
3144             other character type lists, which store here whether the character iterator
3145             can match an empty string (also represented by a non-zero value). */
3146              
3147             for(;;)
3148             {
3149             /* All operations move the code pointer forward.
3150             Therefore infinite recursions are not possible. */
3151              
3152 44           c = *code;
3153              
3154             /* Skip over callouts */
3155              
3156 44 50         if (c == OP_CALLOUT)
3157             {
3158 0           code += PRIV(OP_lengths)[c];
3159 0           continue;
3160             }
3161              
3162 44 50         if (c == OP_ALT)
3163             {
3164 0 0         do code += GET(code, 1); while (*code == OP_ALT);
3165 0           c = *code;
3166             }
3167              
3168 44           switch(c)
3169             {
3170             case OP_END:
3171             case OP_KETRPOS:
3172             /* TRUE only in greedy case. The non-greedy case could be replaced by
3173             an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3174             uses more memory, which we cannot get at this stage.) */
3175              
3176 3           return base_list[1] != 0;
3177              
3178             case OP_KET:
3179             /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3180             it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3181             cannot be converted to a possessive form. */
3182              
3183 3 50         if (base_list[1] == 0) return FALSE;
3184              
3185 3 50         switch(*(code - GET(code, 1)))
3186             {
3187             case OP_ASSERT:
3188             case OP_ASSERT_NOT:
3189             case OP_ASSERTBACK:
3190             case OP_ASSERTBACK_NOT:
3191             case OP_ONCE:
3192             case OP_ONCE_NC:
3193             /* Atomic sub-patterns and assertions can always auto-possessify their
3194             last iterator. However, if the group was entered as a result of checking
3195             a previous iterator, this is not possible. */
3196              
3197 0           return !entered_a_group;
3198             }
3199              
3200 3           code += PRIV(OP_lengths)[c];
3201 3           continue;
3202              
3203             case OP_ONCE:
3204             case OP_ONCE_NC:
3205             case OP_BRA:
3206             case OP_CBRA:
3207 0           next_code = code + GET(code, 1);
3208 0           code += PRIV(OP_lengths)[c];
3209              
3210 0 0         while (*next_code == OP_ALT)
3211             {
3212 0 0         if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3213 0           return FALSE;
3214 0           code = next_code + 1 + LINK_SIZE;
3215 0           next_code += GET(next_code, 1);
3216             }
3217              
3218 0           entered_a_group = TRUE;
3219 0           continue;
3220              
3221             case OP_BRAZERO:
3222             case OP_BRAMINZERO:
3223              
3224 0           next_code = code + 1;
3225 0 0         if (*next_code != OP_BRA && *next_code != OP_CBRA
    0          
3226 0 0         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
    0          
3227              
3228 0 0         do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3229              
3230             /* The bracket content will be checked by the
3231             OP_BRA/OP_CBRA case above. */
3232 0           next_code += 1 + LINK_SIZE;
3233 0 0         if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3234 0           return FALSE;
3235              
3236 0           code += PRIV(OP_lengths)[c];
3237 0           continue;
3238              
3239             default:
3240 38           break;
3241             }
3242              
3243             /* Check for a supported opcode, and load its properties. */
3244              
3245 38           code = get_chr_property_list(code, utf, cd->fcc, list);
3246 38 50         if (code == NULL) return FALSE; /* Unsupported */
3247              
3248             /* If either opcode is a small character list, set pointers for comparing
3249             characters from that list with another list, or with a property. */
3250              
3251 38 50         if (base_list[0] == OP_CHAR)
3252             {
3253 0           chr_ptr = base_list + 2;
3254 0           list_ptr = list;
3255             }
3256 38 50         else if (list[0] == OP_CHAR)
3257             {
3258 38           chr_ptr = list + 2;
3259 38           list_ptr = base_list;
3260             }
3261              
3262             /* Character bitsets can also be compared to certain opcodes. */
3263              
3264 0 0         else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
    0          
3265             #ifdef COMPILE_PCRE8
3266             /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3267 0 0         || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
    0          
    0          
3268             #endif
3269             )
3270             {
3271             #ifdef COMPILE_PCRE8
3272 0 0         if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
    0          
    0          
3273             #else
3274             if (base_list[0] == OP_CLASS)
3275             #endif
3276             {
3277 0           set1 = (pcre_uint8 *)(base_end - base_list[2]);
3278 0           list_ptr = list;
3279             }
3280             else
3281             {
3282 0           set1 = (pcre_uint8 *)(code - list[2]);
3283 0           list_ptr = base_list;
3284             }
3285              
3286 0           invert_bits = FALSE;
3287 0           switch(list_ptr[0])
3288             {
3289             case OP_CLASS:
3290             case OP_NCLASS:
3291 0           set2 = (pcre_uint8 *)
3292 0 0         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3293 0           break;
3294              
3295             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3296             case OP_XCLASS:
3297             xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3298             if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3299             if ((*xclass_flags & XCL_MAP) == 0)
3300             {
3301             /* No bits are set for characters < 256. */
3302             if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;
3303             /* Might be an empty repeat. */
3304             continue;
3305             }
3306             set2 = (pcre_uint8 *)(xclass_flags + 1);
3307             break;
3308             #endif
3309              
3310             case OP_NOT_DIGIT:
3311 0           invert_bits = TRUE;
3312             /* Fall through */
3313             case OP_DIGIT:
3314 0           set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3315 0           break;
3316              
3317             case OP_NOT_WHITESPACE:
3318 0           invert_bits = TRUE;
3319             /* Fall through */
3320             case OP_WHITESPACE:
3321 0           set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3322 0           break;
3323              
3324             case OP_NOT_WORDCHAR:
3325 0           invert_bits = TRUE;
3326             /* Fall through */
3327             case OP_WORDCHAR:
3328 0           set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3329 0           break;
3330              
3331             default:
3332 0           return FALSE;
3333             }
3334              
3335             /* Because the sets are unaligned, we need
3336             to perform byte comparison here. */
3337 0           set_end = set1 + 32;
3338 0 0         if (invert_bits)
3339             {
3340             do
3341             {
3342 0 0         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3343             }
3344 0 0         while (set1 < set_end);
3345             }
3346             else
3347             {
3348             do
3349             {
3350 0 0         if ((*set1++ & *set2++) != 0) return FALSE;
3351             }
3352 0 0         while (set1 < set_end);
3353             }
3354              
3355 0 0         if (list[1] == 0) return TRUE;
3356             /* Might be an empty repeat. */
3357 0           continue;
3358             }
3359              
3360             /* Some property combinations also acceptable. Unicode property opcodes are
3361             processed specially; the rest can be handled with a lookup table. */
3362              
3363             else
3364             {
3365             pcre_uint32 leftop, rightop;
3366              
3367 0           leftop = base_list[0];
3368 0           rightop = list[0];
3369              
3370             #ifdef SUPPORT_UCP
3371             accepted = FALSE; /* Always set in non-unicode case. */
3372             if (leftop == OP_PROP || leftop == OP_NOTPROP)
3373             {
3374             if (rightop == OP_EOD)
3375             accepted = TRUE;
3376             else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3377             {
3378             int n;
3379             const pcre_uint8 *p;
3380             BOOL same = leftop == rightop;
3381             BOOL lisprop = leftop == OP_PROP;
3382             BOOL risprop = rightop == OP_PROP;
3383             BOOL bothprop = lisprop && risprop;
3384              
3385             /* There's a table that specifies how each combination is to be
3386             processed:
3387             0 Always return FALSE (never auto-possessify)
3388             1 Character groups are distinct (possessify if both are OP_PROP)
3389             2 Check character categories in the same group (general or particular)
3390             3 Return TRUE if the two opcodes are not the same
3391             ... see comments below
3392             */
3393              
3394             n = propposstab[base_list[2]][list[2]];
3395             switch(n)
3396             {
3397             case 0: break;
3398             case 1: accepted = bothprop; break;
3399             case 2: accepted = (base_list[3] == list[3]) != same; break;
3400             case 3: accepted = !same; break;
3401              
3402             case 4: /* Left general category, right particular category */
3403             accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3404             break;
3405              
3406             case 5: /* Right general category, left particular category */
3407             accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3408             break;
3409              
3410             /* This code is logically tricky. Think hard before fiddling with it.
3411             The posspropstab table has four entries per row. Each row relates to
3412             one of PCRE's special properties such as ALNUM or SPACE or WORD.
3413             Only WORD actually needs all four entries, but using repeats for the
3414             others means they can all use the same code below.
3415              
3416             The first two entries in each row are Unicode general categories, and
3417             apply always, because all the characters they include are part of the
3418             PCRE character set. The third and fourth entries are a general and a
3419             particular category, respectively, that include one or more relevant
3420             characters. One or the other is used, depending on whether the check
3421             is for a general or a particular category. However, in both cases the
3422             category contains more characters than the specials that are defined
3423             for the property being tested against. Therefore, it cannot be used
3424             in a NOTPROP case.
3425              
3426             Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3427             Underscore is covered by ucp_P or ucp_Po. */
3428              
3429             case 6: /* Left alphanum vs right general category */
3430             case 7: /* Left space vs right general category */
3431             case 8: /* Left word vs right general category */
3432             p = posspropstab[n-6];
3433             accepted = risprop && lisprop ==
3434             (list[3] != p[0] &&
3435             list[3] != p[1] &&
3436             (list[3] != p[2] || !lisprop));
3437             break;
3438              
3439             case 9: /* Right alphanum vs left general category */
3440             case 10: /* Right space vs left general category */
3441             case 11: /* Right word vs left general category */
3442             p = posspropstab[n-9];
3443             accepted = lisprop && risprop ==
3444             (base_list[3] != p[0] &&
3445             base_list[3] != p[1] &&
3446             (base_list[3] != p[2] || !risprop));
3447             break;
3448              
3449             case 12: /* Left alphanum vs right particular category */
3450             case 13: /* Left space vs right particular category */
3451             case 14: /* Left word vs right particular category */
3452             p = posspropstab[n-12];
3453             accepted = risprop && lisprop ==
3454             (catposstab[p[0]][list[3]] &&
3455             catposstab[p[1]][list[3]] &&
3456             (list[3] != p[3] || !lisprop));
3457             break;
3458              
3459             case 15: /* Right alphanum vs left particular category */
3460             case 16: /* Right space vs left particular category */
3461             case 17: /* Right word vs left particular category */
3462             p = posspropstab[n-15];
3463             accepted = lisprop && risprop ==
3464             (catposstab[p[0]][base_list[3]] &&
3465             catposstab[p[1]][base_list[3]] &&
3466             (base_list[3] != p[3] || !risprop));
3467             break;
3468             }
3469             }
3470             }
3471              
3472             else
3473             #endif /* SUPPORT_UCP */
3474              
3475 0 0         accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
    0          
3476 0 0         rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
    0          
    0          
3477 0           autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3478              
3479 0 0         if (!accepted) return FALSE;
3480              
3481 0 0         if (list[1] == 0) return TRUE;
3482             /* Might be an empty repeat. */
3483 0           continue;
3484             }
3485              
3486             /* Control reaches here only if one of the items is a small character list.
3487             All characters are checked against the other side. */
3488              
3489             do
3490             {
3491 38           chr = *chr_ptr;
3492              
3493 38           switch(list_ptr[0])
3494             {
3495             case OP_CHAR:
3496 0           ochr_ptr = list_ptr + 2;
3497             do
3498             {
3499 0 0         if (chr == *ochr_ptr) return FALSE;
3500 0           ochr_ptr++;
3501             }
3502 0 0         while(*ochr_ptr != NOTACHAR);
3503 0           break;
3504              
3505             case OP_NOT:
3506 0           ochr_ptr = list_ptr + 2;
3507             do
3508             {
3509 0 0         if (chr == *ochr_ptr)
3510 0           break;
3511 0           ochr_ptr++;
3512             }
3513 0 0         while(*ochr_ptr != NOTACHAR);
3514 0 0         if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3515 0           break;
3516              
3517             /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3518             set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3519              
3520             case OP_DIGIT:
3521 0 0         if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
    0          
3522 0           break;
3523              
3524             case OP_NOT_DIGIT:
3525 0 0         if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
    0          
3526 0           break;
3527              
3528             case OP_WHITESPACE:
3529 0 0         if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
    0          
3530 0           break;
3531              
3532             case OP_NOT_WHITESPACE:
3533 0 0         if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
    0          
3534 0           break;
3535              
3536             case OP_WORDCHAR:
3537 0 0         if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
    0          
3538 0           break;
3539              
3540             case OP_NOT_WORDCHAR:
3541 0 0         if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
    0          
3542 0           break;
3543              
3544             case OP_HSPACE:
3545 0 0         switch(chr)
3546             {
3547 0           HSPACE_CASES: return FALSE;
3548 0           default: break;
3549             }
3550 0           break;
3551              
3552             case OP_NOT_HSPACE:
3553 0 0         switch(chr)
3554             {
3555 0           HSPACE_CASES: break;
3556 0           default: return FALSE;
3557             }
3558 0           break;
3559              
3560             case OP_ANYNL:
3561             case OP_VSPACE:
3562 0 0         switch(chr)
3563             {
3564 0           VSPACE_CASES: return FALSE;
3565 0           default: break;
3566             }
3567 0           break;
3568              
3569             case OP_NOT_VSPACE:
3570 0 0         switch(chr)
3571             {
3572 0           VSPACE_CASES: break;
3573 0           default: return FALSE;
3574             }
3575 0           break;
3576              
3577             case OP_DOLL:
3578             case OP_EODN:
3579 0 0         switch (chr)
3580             {
3581             case CHAR_CR:
3582             case CHAR_LF:
3583             case CHAR_VT:
3584             case CHAR_FF:
3585             case CHAR_NEL:
3586             #ifndef EBCDIC
3587             case 0x2028:
3588             case 0x2029:
3589             #endif /* Not EBCDIC */
3590 0           return FALSE;
3591             }
3592 0           break;
3593              
3594             case OP_EOD: /* Can always possessify before \z */
3595 0           break;
3596              
3597             #ifdef SUPPORT_UCP
3598             case OP_PROP:
3599             case OP_NOTPROP:
3600             if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3601             list_ptr[0] == OP_NOTPROP))
3602             return FALSE;
3603             break;
3604             #endif
3605              
3606             case OP_NCLASS:
3607 0 0         if (chr > 255) return FALSE;
3608             /* Fall through */
3609              
3610             case OP_CLASS:
3611 0 0         if (chr > 255) break;
3612 0           class_bitset = (pcre_uint8 *)
3613 0 0         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3614 0 0         if ((class_bitset[chr >> 3] & (1U << (chr & 7))) != 0) return FALSE;
3615 0           break;
3616              
3617             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3618             case OP_XCLASS:
3619             if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3620             list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3621             break;
3622             #endif
3623              
3624             default:
3625 38           return FALSE;
3626             }
3627              
3628 0           chr_ptr++;
3629             }
3630 0 0         while(*chr_ptr != NOTACHAR);
3631              
3632             /* At least one character must be matched from this opcode. */
3633              
3634 0 0         if (list[1] == 0) return TRUE;
3635 44           }
3636              
3637             /* Control never reaches here. There used to be a fail-safe return FALSE; here,
3638             but some compilers complain about an unreachable statement. */
3639              
3640             }
3641              
3642              
3643              
3644             /*************************************************
3645             * Scan compiled regex for auto-possession *
3646             *************************************************/
3647              
3648             /* Replaces single character iterations with their possessive alternatives
3649             if appropriate. This function modifies the compiled opcode!
                 
                 The scan walks the byte code one item at a time until OP_END is reached. Two
                 kinds of item are candidates for rewriting:
                 
                 (1) repeat opcodes in the range OP_STAR..OP_TYPEPOSUPTO whose normalised
                 value is no greater than OP_MINUPTO;
                 (2) OP_CLASS, OP_NCLASS and OP_XCLASS items that are followed by a repeat
                 opcode in the range OP_CRSTAR..OP_CRMINRANGE.
                 
                 A candidate is rewritten in place when compare_opcodes() returns TRUE for
                 the code that follows it. The scan stops silently if it meets an opcode
                 value that is >= OP_TABLE_LENGTH.
3650              
3651             Arguments:
3652             code points to start of the byte code
3653             utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3654             cd static compile data
3655              
3656             Returns: nothing
3657             */
3658              
3659             static void
3660 46           auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3661             {
3662             register pcre_uchar c;
3663             const pcre_uchar *end;
3664             pcre_uchar *repeat_opcode;
3665             pcre_uint32 list[8];
3666             int rec_limit;
3667              
3668             for (;;)
3669             {
3670 851           c = *code;
3671              
3672             /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3673             it may compile without complaining, but may get into a loop here if the code
3674             pointer points to a bad value. This is, of course a documented possibility,
3675             when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3676             just give up on this optimization. */
3677              
3678 851 50         if (c >= OP_TABLE_LENGTH) return;
3679              
3680 851 100         if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
    100          
3681             {
                 /* Normalise c onto the OP_STAR-based numbering by removing the offset of
                 its repeat base. Only normalised values up to OP_MINUPTO are candidates;
                 for anything above that, end is NULL and the item is left untouched. */
3682 41           c -= get_repeat_base(c) - OP_STAR;
3683 41           end = (c <= OP_MINUPTO) ?
3684 41 50         get_chr_property_list(code, utf, cd->fcc, list) : NULL;
                 /* list[1] is 1 for the four greedy forms named here and 0 otherwise. */
3685 41 100         list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
    50          
    0          
    0          
3686              
                 /* rec_limit is handed to compare_opcodes() by address as its budget. */
3687 41           rec_limit = 1000;
3688 41 50         if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
    100          
3689             {
                 /* The switch is on the normalised value c, but the delta is added to
                 *code, which still holds the original opcode. Greedy and minimizing forms
                 both end up at the same possessive opcode. NOTE(review): this presumably
                 relies on every repeat family using the same relative opcode layout -
                 confirm against the opcode definitions. */
3690 3           switch(c)
3691             {
3692             case OP_STAR:
3693 0           *code += OP_POSSTAR - OP_STAR;
3694 0           break;
3695              
3696             case OP_MINSTAR:
3697 0           *code += OP_POSSTAR - OP_MINSTAR;
3698 0           break;
3699              
3700             case OP_PLUS:
3701 3           *code += OP_POSPLUS - OP_PLUS;
3702 3           break;
3703              
3704             case OP_MINPLUS:
3705 0           *code += OP_POSPLUS - OP_MINPLUS;
3706 0           break;
3707              
3708             case OP_QUERY:
3709 0           *code += OP_POSQUERY - OP_QUERY;
3710 0           break;
3711              
3712             case OP_MINQUERY:
3713 0           *code += OP_POSQUERY - OP_MINQUERY;
3714 0           break;
3715              
3716             case OP_UPTO:
3717 0           *code += OP_POSUPTO - OP_UPTO;
3718 0           break;
3719              
3720             case OP_MINUPTO:
3721 0           *code += OP_POSUPTO - OP_MINUPTO;
3722 0           break;
3723             }
3724             }
                 /* Re-read the opcode: it may just have been rewritten, and c was
                 normalised above. */
3725 41           c = *code;
3726             }
3727 810 50         else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
    50          
    50          
3728             {
                 /* Locate the opcode that follows the class item. For OP_XCLASS the
                 distance is read from the item itself; otherwise it is the opcode plus
                 32 bytes' worth of code units (presumably the class bit map). */
3729             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3730             if (c == OP_XCLASS)
3731             repeat_opcode = code + GET(code, 1);
3732             else
3733             #endif
3734 0           repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3735              
3736 0           c = *repeat_opcode;
3737 0 0         if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
    0          
3738             {
3739             /* end must not be NULL. */
3740 0           end = get_chr_property_list(code, utf, cd->fcc, list);
3741              
                 /* list[1] is 1 when the repeat opcode value is even. NOTE(review):
                 presumably the greedy class repeats are the even-valued ones - confirm. */
3742 0           list[1] = (c & 1) == 0;
3743              
3744 0           rec_limit = 1000;
3745 0 0         if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3746             {
                 /* Here the possessive opcode is assigned directly; greedy and minimizing
                 forms map to the same possessive opcode. */
3747 0           switch (c)
3748             {
3749             case OP_CRSTAR:
3750             case OP_CRMINSTAR:
3751 0           *repeat_opcode = OP_CRPOSSTAR;
3752 0           break;
3753              
3754             case OP_CRPLUS:
3755             case OP_CRMINPLUS:
3756 0           *repeat_opcode = OP_CRPOSPLUS;
3757 0           break;
3758              
3759             case OP_CRQUERY:
3760             case OP_CRMINQUERY:
3761 0           *repeat_opcode = OP_CRPOSQUERY;
3762 0           break;
3763              
3764             case OP_CRRANGE:
3765             case OP_CRMINRANGE:
3766 0           *repeat_opcode = OP_CRPOSRANGE;
3767 0           break;
3768             }
3769             }
3770             }
                 /* Restore c to the class opcode so that the code below skips the class
                 item itself. */
3771 0           c = *code;
3772             }
3773              
                 /* Skip any variable-length data that the fixed length table below does not
                 account for, and stop at OP_END. */
3774 851           switch(c)
3775             {
3776             case OP_END:
3777 46           return;
3778              
3779             case OP_TYPESTAR:
3780             case OP_TYPEMINSTAR:
3781             case OP_TYPEPLUS:
3782             case OP_TYPEMINPLUS:
3783             case OP_TYPEQUERY:
3784             case OP_TYPEMINQUERY:
3785             case OP_TYPEPOSSTAR:
3786             case OP_TYPEPOSPLUS:
3787             case OP_TYPEPOSQUERY:
3788 41 50         if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
    50          
3789 41           break;
3790              
3791             case OP_TYPEUPTO:
3792             case OP_TYPEMINUPTO:
3793             case OP_TYPEEXACT:
3794             case OP_TYPEPOSUPTO:
3795 0 0         if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
    0          
3796 0           code += 2;
3797 0           break;
3798              
3799             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3800             case OP_XCLASS:
3801             code += GET(code, 1);
3802             break;
3803             #endif
3804              
3805             case OP_MARK:
3806             case OP_PRUNE_ARG:
3807             case OP_SKIP_ARG:
3808             case OP_THEN_ARG:
3809 0           code += code[1];
3810 0           break;
3811             }
3812              
3813             /* Add in the fixed length from the table */
3814              
3815 805           code += PRIV(OP_lengths)[c];
3816              
3817             /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3818             a multi-byte character. The length in the table is a minimum, so we have to
3819             arrange to skip the extra bytes. */
3820              
3821             #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3822             if (utf) switch(c)
3823             {
3824             case OP_CHAR:
3825             case OP_CHARI:
3826             case OP_NOT:
3827             case OP_NOTI:
3828             case OP_STAR:
3829             case OP_MINSTAR:
3830             case OP_PLUS:
3831             case OP_MINPLUS:
3832             case OP_QUERY:
3833             case OP_MINQUERY:
3834             case OP_UPTO:
3835             case OP_MINUPTO:
3836             case OP_EXACT:
3837             case OP_POSSTAR:
3838             case OP_POSPLUS:
3839             case OP_POSQUERY:
3840             case OP_POSUPTO:
3841             case OP_STARI:
3842             case OP_MINSTARI:
3843             case OP_PLUSI:
3844             case OP_MINPLUSI:
3845             case OP_QUERYI:
3846             case OP_MINQUERYI:
3847             case OP_UPTOI:
3848             case OP_MINUPTOI:
3849             case OP_EXACTI:
3850             case OP_POSSTARI:
3851             case OP_POSPLUSI:
3852             case OP_POSQUERYI:
3853             case OP_POSUPTOI:
3854             case OP_NOTSTAR:
3855             case OP_NOTMINSTAR:
3856             case OP_NOTPLUS:
3857             case OP_NOTMINPLUS:
3858             case OP_NOTQUERY:
3859             case OP_NOTMINQUERY:
3860             case OP_NOTUPTO:
3861             case OP_NOTMINUPTO:
3862             case OP_NOTEXACT:
3863             case OP_NOTPOSSTAR:
3864             case OP_NOTPOSPLUS:
3865             case OP_NOTPOSQUERY:
3866             case OP_NOTPOSUPTO:
3867             case OP_NOTSTARI:
3868             case OP_NOTMINSTARI:
3869             case OP_NOTPLUSI:
3870             case OP_NOTMINPLUSI:
3871             case OP_NOTQUERYI:
3872             case OP_NOTMINQUERYI:
3873             case OP_NOTUPTOI:
3874             case OP_NOTMINUPTOI:
3875             case OP_NOTEXACTI:
3876             case OP_NOTPOSSTARI:
3877             case OP_NOTPOSPLUSI:
3878             case OP_NOTPOSQUERYI:
3879             case OP_NOTPOSUPTOI:
                 /* code has already been advanced, so code[-1] is the last code unit of
                 the item just skipped. */
3880             if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3881             break;
3882             }
3883             #else
3884             (void)(utf); /* Keep compiler happy by referencing function argument */
3885             #endif
3886 851           }
3887             }
3888              
3889              
3890              
3891             /*************************************************
3892             * Check for POSIX class syntax *
3893             *************************************************/
3894              
3895             /* This function is called when the sequence "[:" or "[." or "[=" is
3896             encountered in a character class. It checks whether this is followed by a
3897             sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3898             reach an unescaped ']' without the special preceding character, return FALSE.
3899              
3900             Originally, this function only recognized a sequence of letters between the
3901             terminators, but it seems that Perl recognizes any sequence of characters,
3902             though of course unknown POSIX names are subsequently rejected. Perl gives an
3903             "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3904             didn't consider this to be a POSIX class. Likewise for [:1234:].
3905              
3906             The problem in trying to be exactly like Perl is in the handling of escapes. We
3907             have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3908             class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3909             below handles the special cases \\ and \], but does not try to do any other
3910             escape processing. This makes it different from Perl for cases such as
3911             [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3912             not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3913             when Perl does, I think.
3914              
3915             A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3916             It seems that the appearance of a nested POSIX class supersedes an apparent
3917             external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3918             a digit.
3919              
3920             In Perl, unescaped square brackets may also appear as part of class names. For
3921             example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3922             [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3923             seem right at all. PCRE does not allow closing square brackets in POSIX class
3924             names.
3925              
3926             Arguments:
3927             ptr pointer to the initial [
3928             endptr where to return the end pointer
3929              
3930             Returns: TRUE or FALSE
                 On TRUE, *endptr points at the terminator character (the one
                 immediately before the closing ']'). On FALSE, which is also
                 returned when the end of the string is reached, *endptr is not
                 written.
3931             */
3932              
3933             static BOOL
3934 0           check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3935             {
3936             pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3937 0           terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
                 /* terminator is the character that follows the initial '['. Scanning starts
                 at the character after it. */
3938 0 0         for (++ptr; *ptr != CHAR_NULL; ptr++)
3939             {
                 /* A backslash followed by ']' or another backslash: step over the escaped
                 character as well, so that it is not examined below. */
3940 0 0         if (*ptr == CHAR_BACKSLASH &&
    0          
3941 0 0         (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3942 0           ptr[1] == CHAR_BACKSLASH))
3943 0           ptr++;
                 /* A '[' followed by the same terminator (a nested item), or a bare ']',
                 means that this is not a POSIX class. */
3944 0 0         else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
    0          
    0          
3945 0           *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
                 /* The terminator immediately followed by ']' closes the item. */
3946 0 0         else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
    0          
3947             {
3948 0           *endptr = ptr;
3949 0           return TRUE;
3950             }
3951             }
3952 0           return FALSE;
3953             }
3954              
3955              
3956              
3957              
3958             /*************************************************
3959             * Check POSIX class name *
3960             *************************************************/
3961              
3962             /* This function is called to check the name given in a POSIX-style class entry
3963             such as [:alnum:].
                 
                 The names are held in the string posix_names, with their lengths in the
                 parallel table posix_name_lengths, which is terminated by a zero entry. A name
                 matches only if its length equals len and its first len characters compare
                 equal.
3964              
3965             Arguments:
3966             ptr points to the first letter
3967             len the length of the name
3968              
3969             Returns: a value representing the name, or -1 if unknown
                 (the value is the index of the name in posix_name_lengths)
3970             */
3971              
3972             static int
3973 0           check_posix_name(const pcre_uchar *ptr, int len)
3974             {
3975 0           const char *pn = posix_names;
3976 0           register int yield = 0;
3977 0 0         while (posix_name_lengths[yield] != 0)
3978             {
3979 0 0         if (len == posix_name_lengths[yield] &&
    0          
3980 0           STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
                 /* Step over this name plus one more character (presumably a separator
                 between the names - confirm against the definition of posix_names). */
3981 0           pn += posix_name_lengths[yield] + 1;
3982 0           yield++;
3983             }
3984 0           return -1;
3985             }
3986              
3987              
3988             /*************************************************
3989             * Adjust OP_RECURSE items in repeated group *
3990             *************************************************/
3991              
3992             /* OP_RECURSE items contain an offset from the start of the regex to the group
3993             that is referenced. This means that groups can be replicated for fixed
3994             repetition simply by copying (because the recursion is allowed to refer to
3995             earlier groups that are outside the current group). However, when a group is
3996             optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3997             inserted before it, after it has been compiled. This means that any OP_RECURSE
3998             items within it that refer to the group itself or any contained groups have to
3999             have their offsets adjusted. That is one of the jobs of this function. Before it
4000             is called, the partially compiled regex must be temporarily terminated with
4001             OP_END.
4002              
4003             This function has been extended to cope with forward references for recursions
4004             and subroutine calls. It must check the list of such references for the
4005             group we are dealing with. If it finds that one of the recursions in the
4006             current group is on this list, it does not adjust the value in the reference
4007             (which is a group number). After the group has been scanned, all the offsets in
4008             the forward reference list for the group are adjusted.
4009              
4010             Arguments:
4011             group points to the start of the group
4012             adjust the amount by which the group is to be moved
4013             utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
4014             cd contains pointers to tables etc.
4015             save_hwm_offset the hwm forward reference offset at the start of the group
4016              
4017             Returns: nothing
4018             */
4019              
4020             static void
4021 6           adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4022             size_t save_hwm_offset)
4023             {
4024             int offset;
4025             pcre_uchar *hc;
4026 6           pcre_uchar *ptr = group;
4027              
                 /* Visit each item that find_recurse() locates, starting from the group. */
4028 6 50         while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4029             {
                 /* Search the forward reference list - the workspace entries from
                 save_hwm_offset up to cd->hwm, LINK_SIZE apart - for an entry whose offset
                 designates the operand of this item (ptr + 1). The loop exits early, with
                 hc < cd->hwm, when such an entry is found. */
4030 0 0         for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4031 0           hc += LINK_SIZE)
4032             {
4033 0           offset = (int)GET(hc, 0);
4034 0 0         if (cd->start_code + offset == ptr + 1) break;
4035             }
4036              
4037             /* If we have not found this recursion on the forward reference list, adjust
4038             the recursion's offset if it's after the start of this group. */
4039              
4040 0 0         if (hc >= cd->hwm)
4041             {
4042 0           offset = (int)GET(ptr, 1);
4043 0 0         if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4044             }
4045              
                 /* Step over this item (one opcode plus a LINK_SIZE operand) before
                 searching for the next one. */
4046 0           ptr += 1 + LINK_SIZE;
4047             }
4048              
4049             /* Now adjust all forward reference offsets for the group. */
4050              
4051 6 50         for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4052 0           hc += LINK_SIZE)
4053             {
4054 0           offset = (int)GET(hc, 0);
4055 0           PUT(hc, 0, offset + adjust);
4056             }
4057 6           }
4058              
4059              
4060              
4061             /*************************************************
4062             * Insert an automatic callout point *
4063             *************************************************/
4064              
4065             /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4066             callout points before each pattern item.
                 
                 The item written at code consists of the OP_CALLOUT opcode, the value 255 (the
                 callout number that the PCRE callout documentation gives for automatic
                 callouts), and two LINK_SIZE fields: the offset of ptr within the pattern, and
                 a length that is set to zero here.
4067              
4068             Arguments:
4069             code current code pointer
4070             ptr current pattern pointer
4071             cd pointers to tables etc
4072              
4073             Returns: new code pointer
                 (the first code unit after the callout item)
4074             */
4075              
4076             static pcre_uchar *
4077 0           auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4078             {
4079 0           *code++ = OP_CALLOUT;
4080 0           *code++ = 255;
4081 0           PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
4082 0           PUT(code, LINK_SIZE, 0); /* Default length */
4083 0           return code + 2 * LINK_SIZE;
4084             }
4085              
4086              
4087              
4088             /*************************************************
4089             * Complete a callout item *
4090             *************************************************/
4091              
4092             /* A callout item contains the length of the next item in the pattern, which
4093             we can't fill in till after we have reached the relevant point. This is used
4094             for both automatic and manual callouts.
                 
                 The length is the distance from the pattern offset stored at position 2 of the
                 callout item (auto_callout() writes it there) to ptr. It is written into the
                 field at position 2 + LINK_SIZE.
4095              
4096             Arguments:
4097             previous_callout points to previous callout item
4098             ptr current pattern pointer
4099             cd pointers to tables etc
4100              
4101             Returns: nothing
4102             */
4103              
4104             static void
4105 0           complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4106             {
4107 0           int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4108 0           PUT(previous_callout, 2 + LINK_SIZE, length);
4109 0           }
4110              
4111              
4112              
4113             #ifdef SUPPORT_UCP
4114             /*************************************************
4115             * Get othercase range *
4116             *************************************************/
4117              
4118             /* This function is passed the start and end of a class range, in UTF-8 mode
4119             with UCP support. It searches up the characters, looking for ranges of
4120             characters in the "other" case. Each call returns the next one, updating the
4121             start address. A character with multiple other cases is returned on its own
4122             with a special return value.
4123              
4124             Arguments:
4125             cptr points to starting character value; updated
4126             d end value
4127             ocptr where to put start of othercase range
4128             odptr where to put end of othercase range
4129              
4130             Yield: -1 when no more
                 (none of *cptr, *ocptr and *odptr is written)
4131             0 when a range is returned
4132             >0 the CASESET offset for char with multiple other cases
4133             in this case, ocptr contains the original
                 (and *odptr is not written)
4134             */
4135              
4136             static int
4137             get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4138             pcre_uint32 *odptr)
4139             {
4140             pcre_uint32 c, othercase, next;
4141             unsigned int co;
4142              
4143             /* Find the first character that has an other case. If it has multiple other
4144             cases, return its case offset value. */
4145              
4146             for (c = *cptr; c <= d; c++)
4147             {
4148             if ((co = UCD_CASESET(c)) != 0)
4149             {
4150             *ocptr = c++; /* Character that has the set */
4151             *cptr = c; /* Rest of input range */
4152             return (int)co;
4153             }
4154             if ((othercase = UCD_OTHERCASE(c)) != c) break;
4155             }
4156              
4157             if (c > d) return -1; /* Reached end of range */
4158              
4159             /* Found a character that has a single other case. Search for the end of the
4160             range, which is either the end of the input range, or a character that has zero
4161             or more than one other cases. */
4162              
4163             *ocptr = othercase;
4164             next = othercase + 1;
4165              
                 /* The range continues only while each following character's other case is
                 exactly one more than that of the character before it. */
4166             for (++c; c <= d; c++)
4167             {
4168             if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4169             next++;
4170             }
4171              
4172             *odptr = next - 1; /* End of othercase range */
4173             *cptr = c; /* Rest of input range */
4174             return 0;
4175             }
4176             #endif /* SUPPORT_UCP */
4177              
4178              
4179              
4180             /*************************************************
4181             * Add a character or range to a class *
4182             *************************************************/
4183              
4184             /* This function packages up the logic of adding a character or range of
4185             characters to a class. The character values in the arguments will be within the
4186             valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4187             mutually recursive with the function immediately below.
4188              
4189             Arguments:
4190             classbits the bit map for characters < 256
4191             uchardptr points to the pointer for extra data
4192             options the options word
4193             cd contains pointers to tables etc.
4194             start start of range character
4195             end end of range character
4196              
4197             Returns: the number of < 256 characters added
4198             the pointer to extra data is updated
4199             */
4200              
4201             static int
4202 0           add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4203             compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4204             {
4205             pcre_uint32 c;
4206 0           pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4207 0           int n8 = 0;
4208              
4209             ((void)uchardptr);
4210             ((void)propposstab);
4211             ((void)catposstab);
4212             ((void)posspropstab);
4213              
4214             /* If caseless matching is required, scan the range and process alternate
4215             cases. In Unicode, there are 8-bit characters that have alternate cases that
4216             are greater than 255 and vice-versa. Sometimes we can just extend the original
4217             range. */
4218              
4219 0 0         if ((options & PCRE_CASELESS) != 0)
4220             {
4221             #ifdef SUPPORT_UCP
4222             if ((options & PCRE_UTF8) != 0)
4223             {
4224             int rc;
4225             pcre_uint32 oc, od;
4226              
4227             options &= ~PCRE_CASELESS; /* Remove for recursive calls */
4228             c = start;
4229              
4230             while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4231             {
4232             /* Handle a single character that has more than one other case. */
4233              
4234             if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4235             PRIV(ucd_caseless_sets) + rc, oc);
4236              
4237             /* Do nothing if the other case range is within the original range. */
4238              
4239             else if (oc >= start && od <= end) continue;
4240              
4241             /* Extend the original range if there is overlap, noting that if oc < c, we
4242             can't have od > end because a subrange is always shorter than the basic
4243             range. Otherwise, use a recursive call to add the additional range. */
4244              
4245             else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4246             else if (od > end && oc <= end + 1)
4247             {
4248             end = od; /* Extend upwards */
4249             if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4250             }
4251             else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4252             }
4253             }
4254             else
4255             #endif /* SUPPORT_UCP */
4256              
4257             /* Not UTF-mode, or no UCP */
4258              
4259 0 0         for (c = start; c <= classbits_end; c++)
4260             {
4261 0           SETBIT(classbits, cd->fcc[c]);
4262 0           n8++;
4263             }
4264             }
4265              
4266             /* Now handle the original range. Adjust the final value according to the bit
4267             length - this means that the same lists of (e.g.) horizontal spaces can be used
4268             in all cases. */
4269              
4270             #if defined COMPILE_PCRE8
4271             #ifdef SUPPORT_UTF
4272             if ((options & PCRE_UTF8) == 0)
4273             #endif
4274 0 0         if (end > 0xff) end = 0xff;
4275              
4276             #elif defined COMPILE_PCRE16
4277             #ifdef SUPPORT_UTF
4278             if ((options & PCRE_UTF16) == 0)
4279             #endif
4280             if (end > 0xffff) end = 0xffff;
4281              
4282             #endif /* COMPILE_PCRE[8|16] */
4283              
4284             /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4285              
4286 0 0         for (c = start; c <= classbits_end; c++)
4287             {
4288             /* Regardless of start, c will always be <= 255. */
4289 0           SETBIT(classbits, c);
4290 0           n8++;
4291             }
4292              
4293             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4294             if (start <= 0xff) start = 0xff + 1;
4295              
4296             if (end >= start)
4297             {
4298             pcre_uchar *uchardata = *uchardptr;
4299             #ifdef SUPPORT_UTF
4300             if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
4301             {
4302             if (start < end)
4303             {
4304             *uchardata++ = XCL_RANGE;
4305             uchardata += PRIV(ord2utf)(start, uchardata);
4306             uchardata += PRIV(ord2utf)(end, uchardata);
4307             }
4308             else if (start == end)
4309             {
4310             *uchardata++ = XCL_SINGLE;
4311             uchardata += PRIV(ord2utf)(start, uchardata);
4312             }
4313             }
4314             else
4315             #endif /* SUPPORT_UTF */
4316              
4317             /* Without UTF support, character values are constrained by the bit length,
4318             and can only be > 256 for 16-bit and 32-bit libraries. */
4319              
4320             #ifdef COMPILE_PCRE8
4321             {}
4322             #else
4323             if (start < end)
4324             {
4325             *uchardata++ = XCL_RANGE;
4326             *uchardata++ = start;
4327             *uchardata++ = end;
4328             }
4329             else if (start == end)
4330             {
4331             *uchardata++ = XCL_SINGLE;
4332             *uchardata++ = start;
4333             }
4334             #endif
4335              
4336             *uchardptr = uchardata; /* Updata extra data pointer */
4337             }
4338             #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4339              
4340 0           return n8; /* Number of 8-bit characters */
4341             }
4342              
4343              
4344              
4345              
4346             /*************************************************
4347             * Add a list of characters to a class *
4348             *************************************************/
4349              
4350             /* This function is used for adding a list of case-equivalent characters to a
4351             class, and also for adding a list of horizontal or vertical whitespace. If the
4352             list is in order (which it should be), ranges of characters are detected and
4353             handled appropriately. This function is mutually recursive with the function
4354             above.
4355              
4356             Arguments:
4357             classbits the bit map for characters < 256
4358             uchardptr points to the pointer for extra data
4359             options the options word
4360             cd contains pointers to tables etc.
4361             p points to row of 32-bit values, terminated by NOTACHAR
4362             except character to omit; this is used when adding lists of
4363             case-equivalent characters to avoid including the one we
4364             already know about
4365              
4366             Returns: the number of < 256 characters added
4367             the pointer to extra data is updated
4368             */
4369              
4370             static int
4371 0           add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4372             compile_data *cd, const pcre_uint32 *p, unsigned int except)
4373             {
                 /* Running total of the counts returned by add_to_class(). */
4374 0           int n8 = 0;
                 /* Walk the list until an entry that is not below NOTACHAR is reached. */
4375 0 0         while (p[0] < NOTACHAR)
4376             {
                 /* n counts the entries after p[0] that continue a run of consecutive
                 values. It stays 0 when p[0] is the excepted value, so that only this one
                 entry is stepped over. */
4377 0           int n = 0;
4378 0 0         if (p[0] != except)
4379             {
                 /* Grow the run while the next entry is exactly one more than the entry
                 before it, then add the whole run p[0]..p[n] as a single range. Only the
                 first value of a run is compared with "except". */
4380 0 0         while(p[n+1] == p[0] + n + 1) n++;
4381 0           n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4382             }
                 /* Move past everything handled in this iteration. */
4383 0           p += n + 1;
4384             }
4385 0           return n8;
4386             }
4387              
4388              
4389              
4390             /*************************************************
4391             * Add characters not in a list to a class *
4392             *************************************************/
4393              
4394             /* This function is used for adding the complement of a list of horizontal or
4395             vertical whitespace to a class. The list must be in order.
4396              
4397             Arguments:
4398             classbits the bit map for characters < 256
4399             uchardptr points to the pointer for extra data
4400             options the options word
4401             cd contains pointers to tables etc.
4402             p points to row of 32-bit values, terminated by NOTACHAR
4403              
4404             Returns: the number of < 256 characters added
4405             the pointer to extra data is updated
4406             */
4407              
4408             static int
4409 0           add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4410             int options, compile_data *cd, const pcre_uint32 *p)
4411             {
                 /* utf selects the upper limit used for the final range added below. */
4412 0           BOOL utf = (options & PCRE_UTF8) != 0;
                 /* Running total of the counts returned by add_to_class(). */
4413 0           int n8 = 0;
                 /* Everything below the first list entry is outside the list, so add the
                 range 0..p[0]-1 unless the list starts at zero. */
4414 0 0         if (p[0] > 0)
4415 0           n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4416 0 0         while (p[0] < NOTACHAR)
4417             {
                 /* Step forward to the last value of a run of consecutive entries. */
4418 0 0         while (p[1] == p[0] + 1) p++;
                 /* Add the gap that follows the run. It ends just before the next entry,
                 or, when the next entry is the NOTACHAR terminator, at 0x10ffff in UTF
                 mode and 0xffffffff otherwise. */
4419 0 0         n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4420 0 0         (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4421 0           p++;
4422             }
4423 0           return n8;
4424             }
4425              
4426              
4427              
4428             /*************************************************
4429             * Compile one branch *
4430             *************************************************/
4431              
4432             /* Scan the pattern, compiling it into a vector. If the options are
4433             changed during the branch, the pointer is used to change the external options
4434             bits. This function is used during the pre-compile phase when we are trying
4435             to find out the amount of memory needed, as well as during the real compile
4436             phase. The value of lengthptr distinguishes the two phases.
4437              
4438             Arguments:
4439             optionsptr pointer to the option bits
4440             codeptr points to the pointer to the current code point
4441             ptrptr points to the current pattern pointer
4442             errorcodeptr points to error code variable
4443             firstcharptr place to put the first required character
4444             firstcharflagsptr place to put the first character flags, or a negative number
4445             reqcharptr place to put the last required character
4446             reqcharflagsptr place to put the last required character flags, or a negative number
4447             bcptr points to current branch chain
4448             cond_depth conditional nesting depth
4449             cd contains pointers to tables etc.
4450             lengthptr NULL during the real compile phase
4451             points to length accumulator during pre-compile phase
4452              
4453             Returns: TRUE on success
4454             FALSE, with *errorcodeptr set non-zero on error
4455             */
4456              
4457             static BOOL
4458 98           compile_branch(int *optionsptr, pcre_uchar **codeptr,
4459             const pcre_uchar **ptrptr, int *errorcodeptr,
4460             pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4461             pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4462             branch_chain *bcptr, int cond_depth,
4463             compile_data *cd, int *lengthptr)
4464             {
4465             int repeat_type, op_type;
4466 98           int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4467 98           int bravalue = 0;
4468             int greedy_default, greedy_non_default;
4469             pcre_uint32 firstchar, reqchar;
4470             pcre_int32 firstcharflags, reqcharflags;
4471             pcre_uint32 zeroreqchar, zerofirstchar;
4472             pcre_int32 zeroreqcharflags, zerofirstcharflags;
4473             pcre_int32 req_caseopt, reqvary, tempreqvary;
4474 98           int options = *optionsptr; /* May change dynamically */
4475 98           int after_manual_callout = 0;
4476 98           int length_prevgroup = 0;
4477             register pcre_uint32 c;
4478             int escape;
4479 98           register pcre_uchar *code = *codeptr;
4480 98           pcre_uchar *last_code = code;
4481 98           pcre_uchar *orig_code = code;
4482             pcre_uchar *tempcode;
4483 98           BOOL inescq = FALSE;
4484 98           BOOL groupsetfirstchar = FALSE;
4485 98           const pcre_uchar *ptr = *ptrptr;
4486             const pcre_uchar *tempptr;
4487 98           const pcre_uchar *nestptr = NULL;
4488 98           pcre_uchar *previous = NULL;
4489 98           pcre_uchar *previous_callout = NULL;
4490 98           size_t item_hwm_offset = 0;
4491             pcre_uint8 classbits[32];
4492              
4493             /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4494             must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4495             dynamically as we process the pattern. */
4496              
4497             #ifdef SUPPORT_UTF
4498             /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4499             BOOL utf = (options & PCRE_UTF8) != 0;
4500             #ifndef COMPILE_PCRE32
4501             pcre_uchar utf_chars[6];
4502             #endif
4503             #else
4504 98           BOOL utf = FALSE;
4505             #endif
4506              
4507             /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4508             class_uchardata always so that it can be passed to add_to_class() always,
4509             though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4510             alternative calls for the different cases. */
4511              
4512             pcre_uchar *class_uchardata;
4513             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4514             BOOL xclass;
4515             pcre_uchar *class_uchardata_base;
4516             #endif
4517              
4518             #ifdef PCRE_DEBUG
4519             if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4520             #endif
4521              
4522             /* Set up the default and non-default settings for greediness */
4523              
4524 98           greedy_default = ((options & PCRE_UNGREEDY) != 0);
4525 98           greedy_non_default = greedy_default ^ 1;
4526              
4527             /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4528             matching encountered yet". It gets changed to REQ_NONE if we hit something that
4529             matches a non-fixed char first char; reqchar just remains unset if we never
4530             find one.
4531              
4532             When we hit a repeat whose minimum is zero, we may have to adjust these values
4533             to take the zero repeat into account. This is implemented by setting them to
4534             zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4535             item types that can be repeated set these backoff variables appropriately. */
4536              
4537 98           firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4538 98           firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4539              
4540             /* The variable req_caseopt contains either the REQ_CASELESS value
4541             or zero, according to the current setting of the caseless flag. The
4542             REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4543             firstchar or reqchar variables to record the case status of the
4544             value. This is used only for ASCII characters. */
4545              
4546 98           req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4547              
4548             /* Switch on next character until the end of the branch */
4549              
4550 1502           for (;; ptr++)
4551             {
4552             BOOL negate_class;
4553             BOOL should_flip_negation;
4554             BOOL possessive_quantifier;
4555             BOOL is_quantifier;
4556             BOOL is_recurse;
4557             BOOL reset_bracount;
4558             int class_has_8bitchar;
4559             int class_one_char;
4560             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4561             BOOL xclass_has_prop;
4562             #endif
4563             int newoptions;
4564             int recno;
4565             int refsign;
4566             int skipbytes;
4567             pcre_uint32 subreqchar, subfirstchar;
4568             pcre_int32 subreqcharflags, subfirstcharflags;
4569             int terminator;
4570             unsigned int mclength;
4571             unsigned int tempbracount;
4572             pcre_uint32 ec;
4573             pcre_uchar mcbuffer[8];
4574              
4575             /* Come here to restart the loop without advancing the pointer. */
4576              
4577             REDO_LOOP:
4578              
4579             /* Get next character in the pattern */
4580              
4581 1600           c = *ptr;
4582              
4583             /* If we are at the end of a nested substitution, revert to the outer level
4584             string. Nesting only happens one level deep. */
4585              
4586 1600 100         if (c == CHAR_NULL && nestptr != NULL)
    50          
4587             {
4588 0           ptr = nestptr;
4589 0           nestptr = NULL;
4590 0           c = *ptr;
4591             }
4592              
4593             /* If we are in the pre-compile phase, accumulate the length used for the
4594             previous cycle of this loop. */
4595              
4596 1600 100         if (lengthptr != NULL)
4597             {
4598             #ifdef PCRE_DEBUG
4599             if (code > cd->hwm) cd->hwm = code; /* High water info */
4600             #endif
4601 800 50         if (code > cd->start_workspace + cd->workspace_size -
4602             WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4603             {
4604 0           *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
4605 0 0         ERR52 : ERR87;
4606 0           goto FAILED;
4607             }
4608              
4609             /* There is at least one situation where code goes backwards: this is the
4610             case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4611             the class is simply eliminated. However, it is created first, so we have to
4612             allow memory for it. Therefore, don't ever reduce the length at this point.
4613             */
4614              
4615 800 50         if (code < last_code) code = last_code;
4616              
4617             /* Paranoid check for integer overflow */
4618              
4619 800 50         if (OFLOW_MAX - *lengthptr < code - last_code)
4620             {
4621 0           *errorcodeptr = ERR20;
4622 0           goto FAILED;
4623             }
4624              
4625 800           *lengthptr += (int)(code - last_code);
4626             DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4627             (int)(code - last_code), c, c));
4628              
4629             /* If "previous" is set and it is not at the start of the work space, move
4630             it back to there, in order to avoid filling up the work space. Otherwise,
4631             if "previous" is NULL, reset the current code pointer to the start. */
4632              
4633 800 100         if (previous != NULL)
4634             {
4635 691 100         if (previous > orig_code)
4636             {
4637 606           memmove(orig_code, previous, IN_UCHARS(code - previous));
4638 606           code -= previous - orig_code;
4639 691           previous = orig_code;
4640             }
4641             }
4642 109           else code = orig_code;
4643              
4644             /* Remember where this code item starts so we can pick up the length
4645             next time round. */
4646              
4647 800           last_code = code;
4648             }
4649              
4650             /* In the real compile phase, just check the workspace used by the forward
4651             reference list. */
4652              
4653 800 50         else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4654             {
4655 0           *errorcodeptr = ERR52;
4656 0           goto FAILED;
4657             }
4658              
4659             /* If in \Q...\E, check for the end; if not, we have a literal. Otherwise an
4660             isolated \E is ignored. */
4661              
4662 1600 100         if (c != CHAR_NULL)
4663             {
4664 1508 100         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
    50          
4665             {
4666 0           inescq = FALSE;
4667 0           ptr++;
4668 0           continue;
4669             }
4670 1508 50         else if (inescq)
4671             {
4672 0 0         if (previous_callout != NULL)
4673             {
4674 0 0         if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4675 0           complete_callout(previous_callout, ptr, cd);
4676 0           previous_callout = NULL;
4677             }
4678 0 0         if ((options & PCRE_AUTO_CALLOUT) != 0)
4679             {
4680 0           previous_callout = code;
4681 0           code = auto_callout(code, ptr, cd);
4682             }
4683 0           goto NORMAL_CHAR;
4684             }
4685              
4686             /* Check for the start of a \Q...\E sequence. We must do this here rather
4687             than later in case it is immediately followed by \E, which turns it into a
4688             "do nothing" sequence. */
4689              
4690 1508 100         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
    50          
4691             {
4692 0           inescq = TRUE;
4693 0           ptr++;
4694 0           continue;
4695             }
4696             }
4697              
4698             /* In extended mode, skip white space and comments. */
4699              
4700 1600 50         if ((options & PCRE_EXTENDED) != 0)
4701             {
4702 0           const pcre_uchar *wscptr = ptr;
4703 0 0         while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4704 0 0         if (c == CHAR_NUMBER_SIGN)
4705             {
4706 0           ptr++;
4707 0 0         while (*ptr != CHAR_NULL)
4708             {
4709 0 0         if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
    0          
    0          
    0          
    0          
    0          
    0          
    0          
4710             { /* IS_NEWLINE sets cd->nllen. */
4711 0           ptr += cd->nllen;
4712 0           break;
4713             }
4714 0           ptr++;
4715             #ifdef SUPPORT_UTF
4716             if (utf) FORWARDCHAR(ptr);
4717             #endif
4718             }
4719             }
4720              
4721             /* If we skipped any characters, restart the loop. Otherwise, we didn't see
4722             a comment. */
4723              
4724 0 0         if (ptr > wscptr) goto REDO_LOOP;
4725             }
4726              
4727             /* Skip over (?# comments. We need to do this here because we want to know if
4728             the next thing is a quantifier, and these comments may come between an item
4729             and its quantifier. */
4730              
4731 1600 100         if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
    50          
    0          
4732 0           ptr[2] == CHAR_NUMBER_SIGN)
4733             {
4734 0           ptr += 3;
4735 0 0         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
    0          
4736 0 0         if (*ptr == CHAR_NULL)
4737             {
4738 0           *errorcodeptr = ERR18;
4739 0           goto FAILED;
4740             }
4741 0           continue;
4742             }
4743              
4744             /* See if the next thing is a quantifier. */
4745              
4746 1600           is_quantifier =
4747 1600 100         c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
    100          
    100          
    50          
4748 0 0         (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4749              
4750             /* Fill in length of a previous callout, except when the next thing is a
4751             quantifier or when processing a property substitution string in UCP mode. */
4752              
4753 1600 100         if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
    50          
    0          
    0          
4754 0           after_manual_callout-- <= 0)
4755             {
4756 0 0         if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4757 0           complete_callout(previous_callout, ptr, cd);
4758 0           previous_callout = NULL;
4759             }
4760              
4761             /* Create auto callout, except for quantifiers, or while processing property
4762             strings that are substituted for \w etc in UCP mode. */
4763              
4764 1600 50         if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
    0          
    0          
4765             {
4766 0           previous_callout = code;
4767 0           code = auto_callout(code, ptr, cd);
4768             }
4769              
4770             /* Process the next pattern item. */
4771              
4772 1600           switch(c)
4773             {
4774             /* ===================================================================*/
4775             case CHAR_NULL: /* The branch terminates at string end */
4776             case CHAR_VERTICAL_LINE: /* or | or ) */
4777             case CHAR_RIGHT_PARENTHESIS:
4778 98           *firstcharptr = firstchar;
4779 98           *firstcharflagsptr = firstcharflags;
4780 98           *reqcharptr = reqchar;
4781 98           *reqcharflagsptr = reqcharflags;
4782 98           *codeptr = code;
4783 98           *ptrptr = ptr;
4784 98 100         if (lengthptr != NULL)
4785             {
4786 49 50         if (OFLOW_MAX - *lengthptr < code - last_code)
4787             {
4788 0           *errorcodeptr = ERR20;
4789 0           goto FAILED;
4790             }
4791 49           *lengthptr += (int)(code - last_code); /* To include callout length */
4792             DPRINTF((">> end branch\n"));
4793             }
4794 98           return TRUE;
4795              
4796              
4797             /* ===================================================================*/
4798             /* Handle single-character metacharacters. In multiline mode, ^ disables
4799             the setting of any following char as a first character. */
4800              
4801             case CHAR_CIRCUMFLEX_ACCENT:
4802 16           previous = NULL;
4803 16 50         if ((options & PCRE_MULTILINE) != 0)
4804             {
4805 0 0         if (firstcharflags == REQ_UNSET)
4806 0           zerofirstcharflags = firstcharflags = REQ_NONE;
4807 0           *code++ = OP_CIRCM;
4808             }
4809 16           else *code++ = OP_CIRC;
4810 16           break;
4811              
4812             case CHAR_DOLLAR_SIGN:
4813 16           previous = NULL;
4814 16 50         *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4815 16           break;
4816              
4817             /* There can never be a first char if '.' is first, whatever happens about
4818             repeats. The value of reqchar doesn't change either. */
4819              
4820             case CHAR_DOT:
4821 82 50         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4822 82           zerofirstchar = firstchar;
4823 82           zerofirstcharflags = firstcharflags;
4824 82           zeroreqchar = reqchar;
4825 82           zeroreqcharflags = reqcharflags;
4826 82           previous = code;
4827 82           item_hwm_offset = cd->hwm - cd->start_workspace;
4828 82 50         *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4829 82           break;
4830              
4831              
4832             /* ===================================================================*/
4833             /* Character classes. If the included characters are all < 256, we build a
4834             32-byte bitmap of the permitted characters, except in the special case
4835             where there is only one such character. For negated classes, we build the
4836             map as usual, then invert it at the end. However, we use a different opcode
4837             so that data characters > 255 can be handled correctly.
4838              
4839             If the class contains characters outside the 0-255 range, a different
4840             opcode is compiled. It may optionally have a bit map for characters < 256,
4841             but those above are explicitly listed afterwards. A flag byte tells
4842             whether the bitmap is present, and whether this is a negated class or not.
4843              
4844             In JavaScript compatibility mode, an isolated ']' causes an error. In
4845             default (Perl) mode, it is treated as a data character. */
4846              
4847             case CHAR_RIGHT_SQUARE_BRACKET:
4848 0 0         if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4849             {
4850 0           *errorcodeptr = ERR64;
4851 0           goto FAILED;
4852             }
4853 0           goto NORMAL_CHAR;
4854              
4855             /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4856             used for "start of word" and "end of word". As these are otherwise illegal
4857             sequences, we don't break anything by recognizing them. They are replaced
4858             by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4859             erroneous and are handled by the normal code below. */
4860              
4861             case CHAR_LEFT_SQUARE_BRACKET:
4862 0 0         if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4863             {
4864 0           nestptr = ptr + 7;
4865 0           ptr = sub_start_of_word;
4866 0           goto REDO_LOOP;
4867             }
4868              
4869 0 0         if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4870             {
4871 0           nestptr = ptr + 7;
4872 0           ptr = sub_end_of_word;
4873 0           goto REDO_LOOP;
4874             }
4875              
4876             /* Handle a real character class. */
4877              
4878 0           previous = code;
4879 0           item_hwm_offset = cd->hwm - cd->start_workspace;
4880              
4881             /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4882             they are encountered at the top level, so we'll do that too. */
4883              
4884 0 0         if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
    0          
    0          
4885 0 0         ptr[1] == CHAR_EQUALS_SIGN) &&
4886 0           check_posix_syntax(ptr, &tempptr))
4887             {
4888 0 0         *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4889 0           goto FAILED;
4890             }
4891              
4892             /* If the first character is '^', set the negation flag and skip it. Also,
4893             if the first few characters (either before or after ^) are \Q\E or \E we
4894             skip them too. This makes for compatibility with Perl. */
4895              
4896 0           negate_class = FALSE;
4897             for (;;)
4898             {
4899 0           c = *(++ptr);
4900 0 0         if (c == CHAR_BACKSLASH)
4901             {
4902 0 0         if (ptr[1] == CHAR_E)
4903 0           ptr++;
4904 0 0         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4905 0           ptr += 3;
4906             else
4907 0           break;
4908             }
4909 0 0         else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
    0          
4910 0           negate_class = TRUE;
4911             else break;
4912 0           }
4913              
4914             /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4915             an initial ']' is taken as a data character -- the code below handles
4916             that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4917             [^] must match any character, so generate OP_ALLANY. */
4918              
4919 0 0         if (c == CHAR_RIGHT_SQUARE_BRACKET &&
    0          
4920 0           (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4921             {
4922 0 0         *code++ = negate_class? OP_ALLANY : OP_FAIL;
4923 0 0         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4924 0           zerofirstchar = firstchar;
4925 0           zerofirstcharflags = firstcharflags;
4926 0           break;
4927             }
4928              
4929             /* If a class contains a negative special such as \S, we need to flip the
4930             negation flag at the end, so that support for characters > 255 works
4931             correctly (they are all included in the class). */
4932              
4933 0           should_flip_negation = FALSE;
4934              
4935             /* Extended class (xclass) will be used when characters > 255
4936             might match. */
4937              
4938             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4939             xclass = FALSE;
4940             class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4941             class_uchardata_base = class_uchardata; /* Save the start */
4942             #endif
4943              
4944             /* For optimization purposes, we track some properties of the class:
4945             class_has_8bitchar will be non-zero if the class contains at least one <
4946             256 character; class_one_char will be 1 if the class contains just one
4947             character; xclass_has_prop will be TRUE if unicode property checks
4948             are present in the class. */
4949              
4950 0           class_has_8bitchar = 0;
4951 0           class_one_char = 0;
4952             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4953             xclass_has_prop = FALSE;
4954             #endif
4955              
4956             /* Initialize the 32-char bit map to all zeros. We build the map in a
4957             temporary bit of memory, in case the class contains fewer than two
4958             8-bit characters because in that case the compiled code doesn't use the bit
4959             map. */
4960              
4961 0           memset(classbits, 0, 32 * sizeof(pcre_uint8));
4962              
4963             /* Process characters until ] is reached. By writing this as a "do" it
4964             means that an initial ] is taken as a data character. At the start of the
4965             loop, c contains the first byte of the character. */
4966              
4967 0 0         if (c != CHAR_NULL) do
4968             {
4969             const pcre_uchar *oldptr;
4970              
4971             #ifdef SUPPORT_UTF
4972             if (utf && HAS_EXTRALEN(c))
4973             { /* Braces are required because the */
4974             GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4975             }
4976             #endif
4977              
4978             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4979             /* In the pre-compile phase, accumulate the length of any extra
4980             data and reset the pointer. This is so that very large classes that
4981             contain a zillion > 255 characters no longer overwrite the work space
4982             (which is on the stack). We have to remember that there was XCLASS data,
4983             however. */
4984              
4985             if (class_uchardata > class_uchardata_base) xclass = TRUE;
4986              
4987             if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4988             {
4989             *lengthptr += (int)(class_uchardata - class_uchardata_base);
4990             class_uchardata = class_uchardata_base;
4991             }
4992             #endif
4993              
4994             /* Inside \Q...\E everything is literal except \E */
4995              
4996 0 0         if (inescq)
4997             {
4998 0 0         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
    0          
4999             {
5000 0           inescq = FALSE; /* Reset literal state */
5001 0           ptr++; /* Skip the 'E' */
5002 0           continue; /* Carry on with next */
5003             }
5004 0           goto CHECK_RANGE; /* Could be range if \E follows */
5005             }
5006              
5007             /* Handle POSIX class names. Perl allows a negation extension of the
5008             form [:^name:]. A square bracket that doesn't match the syntax is
5009             treated as a literal. We also recognize the POSIX constructions
5010             [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
5011             5.6 and 5.8 do. */
5012              
5013 0 0         if (c == CHAR_LEFT_SQUARE_BRACKET &&
    0          
5014 0 0         (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
    0          
5015 0 0         ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
5016             {
5017 0           BOOL local_negate = FALSE;
5018             int posix_class, taboffset, tabopt;
5019 0           register const pcre_uint8 *cbits = cd->cbits;
5020             pcre_uint8 pbits[32];
5021              
5022 0 0         if (ptr[1] != CHAR_COLON)
5023             {
5024 0           *errorcodeptr = ERR31;
5025 0           goto FAILED;
5026             }
5027              
5028 0           ptr += 2;
5029 0 0         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
5030             {
5031 0           local_negate = TRUE;
5032 0           should_flip_negation = TRUE; /* Note negative special */
5033 0           ptr++;
5034             }
5035              
5036 0           posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
5037 0 0         if (posix_class < 0)
5038             {
5039 0           *errorcodeptr = ERR30;
5040 0           goto FAILED;
5041             }
5042              
5043             /* If matching is caseless, upper and lower are converted to
5044             alpha. This relies on the fact that the class table starts with
5045             alpha, lower, upper as the first 3 entries. */
5046              
5047 0 0         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
    0          
5048 0           posix_class = 0;
5049              
5050             /* When PCRE_UCP is set, some of the POSIX classes are converted to
5051             different escape sequences that use Unicode properties \p or \P. Others
5052             that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5053             directly. */
5054              
5055             #ifdef SUPPORT_UCP
5056             if ((options & PCRE_UCP) != 0)
5057             {
5058             unsigned int ptype = 0;
5059             int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5060              
5061             /* The posix_substitutes table specifies which POSIX classes can be
5062             converted to \p or \P items. */
5063              
5064             if (posix_substitutes[pc] != NULL)
5065             {
5066             nestptr = tempptr + 1;
5067             ptr = posix_substitutes[pc] - 1;
5068             continue;
5069             }
5070              
5071             /* There are three other classes that generate special property calls
5072             that are recognized only in an XCLASS. */
5073              
5074             else switch(posix_class)
5075             {
5076             case PC_GRAPH:
5077             ptype = PT_PXGRAPH;
5078             /* Fall through */
5079             case PC_PRINT:
5080             if (ptype == 0) ptype = PT_PXPRINT;
5081             /* Fall through */
5082             case PC_PUNCT:
5083             if (ptype == 0) ptype = PT_PXPUNCT;
5084             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5085             *class_uchardata++ = ptype;
5086             *class_uchardata++ = 0;
5087             xclass_has_prop = TRUE;
5088             ptr = tempptr + 1;
5089             continue;
5090              
5091             /* For the other POSIX classes (ascii, cntrl, xdigit) we are going
5092             to fall through to the non-UCP case and build a bit map for
5093             characters with code points less than 256. If we are in a negated
5094             POSIX class, characters with code points greater than 255 must
5095             either all match or all not match. In the special case where we
5096             have not yet generated any xclass data, and this is the final item
5097             in the overall class, we need do nothing: later on, the opcode
5098             OP_NCLASS will be used to indicate that characters greater than 255
5099             are acceptable. If we have already seen an xclass item or one may
5100             follow (we have to assume that it might if this is not the end of
5101             the class), explicitly list all wide codepoints, which will then
5102             either not match or match, depending on whether the class is or is
5103             not negated. */
5104              
5105             default:
5106             if (local_negate &&
5107             (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5108             {
5109             *class_uchardata++ = XCL_RANGE;
5110             class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5111             class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5112             }
5113             break;
5114             }
5115             }
5116             #endif
5117             /* In the non-UCP case, or when UCP makes no difference, we build the
5118             bit map for the POSIX class in a chunk of local store because we may be
5119             adding and subtracting from it, and we don't want to subtract bits that
5120             may be in the main map already. At the end we or the result into the
5121             bit map that is being built. */
5122              
5123 0           posix_class *= 3;
5124              
5125             /* Copy in the first table (always present) */
5126              
5127 0           memcpy(pbits, cbits + posix_class_maps[posix_class],
5128             32 * sizeof(pcre_uint8));
5129              
5130             /* If there is a second table, add or remove it as required. */
5131              
5132 0           taboffset = posix_class_maps[posix_class + 1];
5133 0           tabopt = posix_class_maps[posix_class + 2];
5134              
5135 0 0         if (taboffset >= 0)
5136             {
5137 0 0         if (tabopt >= 0)
5138 0 0         for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5139             else
5140 0 0         for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5141             }
5142              
5143             /* Now see if we need to remove any special characters. An option
5144             value of 1 removes vertical space and 2 removes underscore. */
5145              
5146 0 0         if (tabopt < 0) tabopt = -tabopt;
5147 0 0         if (tabopt == 1) pbits[1] &= ~0x3c;
5148 0 0         else if (tabopt == 2) pbits[11] &= 0x7f;
5149              
5150             /* Add the POSIX table or its complement into the main table that is
5151             being built and we are done. */
5152              
5153 0 0         if (local_negate)
5154 0 0         for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5155             else
5156 0 0         for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5157              
5158 0           ptr = tempptr + 1;
5159             /* Every class contains at least one < 256 character. */
5160 0           class_has_8bitchar = 1;
5161             /* Every class contains at least two characters. */
5162 0           class_one_char = 2;
5163 0           continue; /* End of POSIX syntax handling */
5164             }
5165              
5166             /* Backslash may introduce a single character, or it may introduce one
5167             of the specials, which just set a flag. The sequence \b is a special
5168             case. Inside a class (and only there) it is treated as backspace. We
5169             assume that other escapes have more than one character in them, so
5170             speculatively set both class_has_8bitchar and class_one_char bigger
5171             than one. Unrecognized escapes fall through and are either treated
5172             as literal characters (by default), or are faulted if
5173             PCRE_EXTRA is set. */
5174              
5175 0 0         if (c == CHAR_BACKSLASH)
5176             {
5177 0           escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5178             TRUE);
5179 0 0         if (*errorcodeptr != 0) goto FAILED;
5180 0 0         if (escape == 0) c = ec;
5181 0 0         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5182 0 0         else if (escape == ESC_N) /* \N is not supported in a class */
5183             {
5184 0           *errorcodeptr = ERR71;
5185 0           goto FAILED;
5186             }
5187 0 0         else if (escape == ESC_Q) /* Handle start of quoted string */
5188             {
5189 0 0         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
    0          
5190             {
5191 0           ptr += 2; /* avoid empty string */
5192             }
5193 0           else inescq = TRUE;
5194 0           continue;
5195             }
5196 0 0         else if (escape == ESC_E) continue; /* Ignore orphan \E */
5197              
5198             else
5199             {
5200 0           register const pcre_uint8 *cbits = cd->cbits;
5201             /* Every class contains at least two < 256 characters. */
5202 0           class_has_8bitchar++;
5203             /* Every class contains at least two characters. */
5204 0           class_one_char += 2;
5205              
5206 0           switch (escape)
5207             {
5208             #ifdef SUPPORT_UCP
5209             case ESC_du: /* These are the values given for \d etc */
5210             case ESC_DU: /* when PCRE_UCP is set. We replace the */
5211             case ESC_wu: /* escape sequence with an appropriate \p */
5212             case ESC_WU: /* or \P to test Unicode properties instead */
5213             case ESC_su: /* of the default ASCII testing. */
5214             case ESC_SU:
5215             nestptr = ptr;
5216             ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
5217             class_has_8bitchar--; /* Undo! */
5218             continue;
5219             #endif
5220             case ESC_d:
5221 0 0         for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5222 0           continue;
5223              
5224             case ESC_D:
5225 0           should_flip_negation = TRUE;
5226 0 0         for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5227 0           continue;
5228              
5229             case ESC_w:
5230 0 0         for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5231 0           continue;
5232              
5233             case ESC_W:
5234 0           should_flip_negation = TRUE;
5235 0 0         for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5236 0           continue;
5237              
5238             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5239             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5240             previously set by something earlier in the character class.
5241             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5242             we could just adjust the appropriate bit. From PCRE 8.34 we no
5243             longer treat \s and \S specially. */
5244              
5245             case ESC_s:
5246 0 0         for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5247 0           continue;
5248              
5249             case ESC_S:
5250 0           should_flip_negation = TRUE;
5251 0 0         for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5252 0           continue;
5253              
5254             /* The rest apply in both UCP and non-UCP cases. */
5255              
5256             case ESC_h:
5257 0           (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5258             PRIV(hspace_list), NOTACHAR);
5259 0           continue;
5260              
5261             case ESC_H:
5262 0           (void)add_not_list_to_class(classbits, &class_uchardata, options,
5263             cd, PRIV(hspace_list));
5264 0           continue;
5265              
5266             case ESC_v:
5267 0           (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5268             PRIV(vspace_list), NOTACHAR);
5269 0           continue;
5270              
5271             case ESC_V:
5272 0           (void)add_not_list_to_class(classbits, &class_uchardata, options,
5273             cd, PRIV(vspace_list));
5274 0           continue;
5275              
5276             case ESC_p:
5277             case ESC_P:
5278             #ifdef SUPPORT_UCP
5279             {
5280             BOOL negated;
5281             unsigned int ptype = 0, pdata = 0;
5282             if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5283             goto FAILED;
5284             *class_uchardata++ = ((escape == ESC_p) != negated)?
5285             XCL_PROP : XCL_NOTPROP;
5286             *class_uchardata++ = ptype;
5287             *class_uchardata++ = pdata;
5288             xclass_has_prop = TRUE;
5289             class_has_8bitchar--; /* Undo! */
5290             continue;
5291             }
5292             #else
5293 0           *errorcodeptr = ERR45;
5294 0           goto FAILED;
5295             #endif
5296             /* Unrecognized escapes are faulted if PCRE is running in its
5297             strict mode. By default, for compatibility with Perl, they are
5298             treated as literals. */
5299              
5300             default:
5301 0 0         if ((options & PCRE_EXTRA) != 0)
5302             {
5303 0           *errorcodeptr = ERR7;
5304 0           goto FAILED;
5305             }
5306 0           class_has_8bitchar--; /* Undo the speculative increase. */
5307 0           class_one_char -= 2; /* Undo the speculative increase. */
5308 0           c = *ptr; /* Get the final character and fall through */
5309 0           break;
5310             }
5311             }
5312              
5313             /* Fall through if the escape just defined a single character (c >= 0).
5314             This may be greater than 256. */
5315              
5316 0           escape = 0;
5317              
5318             } /* End of backslash handling */
5319              
5320             /* A character may be followed by '-' to form a range. However, Perl does
5321             not permit ']' to be the end of the range. A '-' character at the end is
5322             treated as a literal. Perl ignores orphaned \E sequences entirely. The
5323             code for handling \Q and \E is messy. */
5324              
5325             CHECK_RANGE:
5326 0 0         while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
    0          
5327             {
5328 0           inescq = FALSE;
5329 0           ptr += 2;
5330             }
5331 0           oldptr = ptr;
5332              
5333             /* Remember if \r or \n were explicitly used */
5334              
5335 0 0         if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
    0          
5336              
5337             /* Check for range */
5338              
5339 0 0         if (!inescq && ptr[1] == CHAR_MINUS)
    0          
5340             {
5341             pcre_uint32 d;
5342 0           ptr += 2;
5343 0 0         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
    0          
5344              
5345             /* If we hit \Q (not followed by \E) at this point, go into escaped
5346             mode. */
5347              
5348 0 0         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
    0          
5349             {
5350 0           ptr += 2;
5351 0 0         if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
    0          
5352 0           { ptr += 2; continue; }
5353 0           inescq = TRUE;
5354 0           break;
5355             }
5356              
5357             /* Minus (hyphen) at the end of a class is treated as a literal, so put
5358             back the pointer and jump to handle the character that preceded it. */
5359              
5360 0 0         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
    0          
    0          
5361             {
5362 0           ptr = oldptr;
5363 0           goto CLASS_SINGLE_CHARACTER;
5364             }
5365              
5366             /* Otherwise, we have a potential range; pick up the next character */
5367              
5368             #ifdef SUPPORT_UTF
5369             if (utf)
5370             { /* Braces are required because the */
5371             GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
5372             }
5373             else
5374             #endif
5375 0           d = *ptr; /* Not UTF-8 mode */
5376              
5377             /* The second part of a range can be a single-character escape
5378             sequence, but not any of the other escapes. Perl treats a hyphen as a
5379             literal in such circumstances. However, in Perl's warning mode, a
5380             warning is given, so PCRE now faults it as it is almost certainly a
5381             mistake on the user's part. */
5382              
5383 0 0         if (!inescq)
5384             {
5385 0 0         if (d == CHAR_BACKSLASH)
5386             {
5387             int descape;
5388 0           descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5389 0 0         if (*errorcodeptr != 0) goto FAILED;
5390              
5391             /* 0 means a character was put into d; \b is backspace; any other
5392             special causes an error. */
5393              
5394 0 0         if (descape != 0)
5395             {
5396 0 0         if (descape == ESC_b) d = CHAR_BS; else
5397             {
5398 0           *errorcodeptr = ERR83;
5399 0           goto FAILED;
5400             }
5401             }
5402             }
5403              
5404             /* A hyphen followed by a POSIX class is treated in the same way. */
5405              
5406 0 0         else if (d == CHAR_LEFT_SQUARE_BRACKET &&
    0          
5407 0 0         (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
    0          
5408 0 0         ptr[1] == CHAR_EQUALS_SIGN) &&
5409 0           check_posix_syntax(ptr, &tempptr))
5410             {
5411 0           *errorcodeptr = ERR83;
5412 0           goto FAILED;
5413             }
5414             }
5415              
5416             /* Check that the two values are in the correct order. Optimize
5417             one-character ranges. */
5418              
5419 0 0         if (d < c)
5420             {
5421 0           *errorcodeptr = ERR8;
5422 0           goto FAILED;
5423             }
5424 0 0         if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
5425              
5426             /* We have found a character range, so single character optimizations
5427             cannot be done anymore. Any value greater than 1 indicates that there
5428             is more than one character. */
5429              
5430 0           class_one_char = 2;
5431              
5432             /* Remember an explicit \r or \n, and add the range to the class. */
5433              
5434 0 0         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
    0          
5435              
5436 0           class_has_8bitchar +=
5437 0           add_to_class(classbits, &class_uchardata, options, cd, c, d);
5438              
5439 0           continue; /* Go get the next char in the class */
5440             }
5441              
5442             /* Handle a single character - we can get here for a normal non-escape
5443             char, or after \ that introduces a single character or for an apparent
5444             range that isn't. Only the value 1 matters for class_one_char, so don't
5445             increase it if it is already 2 or more ... just in case there's a class
5446             with a zillion characters in it. */
5447              
5448             CLASS_SINGLE_CHARACTER:
5449 0 0         if (class_one_char < 2) class_one_char++;
5450              
5451             /* If xclass_has_prop is false and class_one_char is 1, we have the first
5452             single character in the class, and there have been no prior ranges, or
5453             XCLASS items generated by escapes. If this is the final character in the
5454             class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5455             if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5456             can cause firstchar to be set. Otherwise, there can be no first char if
5457             this item is first, whatever repeat count may follow. In the case of
5458             reqchar, save the previous value for reinstating. */
5459              
5460 0 0         if (!inescq &&
    0          
5461             #ifdef SUPPORT_UCP
5462             !xclass_has_prop &&
5463             #endif
5464 0 0         class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5465             {
5466 0           ptr++;
5467 0           zeroreqchar = reqchar;
5468 0           zeroreqcharflags = reqcharflags;
5469              
5470 0 0         if (negate_class)
5471             {
5472             #ifdef SUPPORT_UCP
5473             int d;
5474             #endif
5475 0 0         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5476 0           zerofirstchar = firstchar;
5477 0           zerofirstcharflags = firstcharflags;
5478              
5479             /* For caseless UTF-8 mode when UCP support is available, check
5480             whether this character has more than one other case. If so, generate
5481             a special OP_NOTPROP item instead of OP_NOTI. */
5482              
5483             #ifdef SUPPORT_UCP
5484             if (utf && (options & PCRE_CASELESS) != 0 &&
5485             (d = UCD_CASESET(c)) != 0)
5486             {
5487             *code++ = OP_NOTPROP;
5488             *code++ = PT_CLIST;
5489             *code++ = d;
5490             }
5491             else
5492             #endif
5493             /* Char has only one other case, or UCP not available */
5494              
5495             {
5496 0 0         *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5497             #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5498             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5499             code += PRIV(ord2utf)(c, code);
5500             else
5501             #endif
5502 0           *code++ = c;
5503             }
5504              
5505             /* We are finished with this character class */
5506              
5507 0           goto END_CLASS;
5508             }
5509              
5510             /* For a single, positive character, get the value into mcbuffer, and
5511             then we can handle this with the normal one-character code. */
5512              
5513             #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5514             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5515             mclength = PRIV(ord2utf)(c, mcbuffer);
5516             else
5517             #endif
5518             {
5519 0           mcbuffer[0] = c;
5520 0           mclength = 1;
5521             }
5522 0           goto ONE_CHAR;
5523             } /* End of 1-char optimization */
5524              
5525             /* There is more than one character in the class, or an XCLASS item
5526             has been generated. Add this character to the class. */
5527              
5528 0           class_has_8bitchar +=
5529 0           add_to_class(classbits, &class_uchardata, options, cd, c, c);
5530             }
5531              
5532             /* Loop until ']' reached. This "while" is the end of the "do" far above.
5533             If we are at the end of an internal nested string, revert to the outer
5534             string. */
5535              
5536 0 0         while (((c = *(++ptr)) != CHAR_NULL ||
5537 0 0         (nestptr != NULL &&
5538 0 0         (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5539 0 0         (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
    0          
5540              
5541             /* Check for missing terminating ']' */
5542              
5543 0 0         if (c == CHAR_NULL)
5544             {
5545 0           *errorcodeptr = ERR6;
5546 0           goto FAILED;
5547             }
5548              
5549             /* We will need an XCLASS if data has been placed in class_uchardata. In
5550             the second phase this is a sufficient test. However, in the pre-compile
5551             phase, class_uchardata gets emptied to prevent workspace overflow, so
5552             only if the very last character in the class needs XCLASS will it contain
5553             anything at this point. For this reason, xclass gets set TRUE above when
5554             class_uchardata is emptied, and that's why this code is the way it is here
5555             instead of just doing a test on class_uchardata below. */
5556              
5557             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5558             if (class_uchardata > class_uchardata_base) xclass = TRUE;
5559             #endif
5560              
5561             /* If this is the first thing in the branch, there can be no first char
5562             setting, whatever the repeat count. Any reqchar setting must remain
5563             unchanged after any kind of repeat. */
5564              
5565 0 0         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5566 0           zerofirstchar = firstchar;
5567 0           zerofirstcharflags = firstcharflags;
5568 0           zeroreqchar = reqchar;
5569 0           zeroreqcharflags = reqcharflags;
5570              
5571             /* If there are characters with values > 255, we have to compile an
5572             extended class, with its own opcode, unless there was a negated special
5573             such as \S in the class, and PCRE_UCP is not set, because in that case all
5574             characters > 255 are in the class, so any that were explicitly given as
5575             well can be ignored. If (when there are explicit characters > 255 that must
5576             be listed) there are no characters < 256, we can omit the bitmap in the
5577             actual compiled code. */
5578              
5579             #ifdef SUPPORT_UTF
5580             if (xclass && (xclass_has_prop || !should_flip_negation ||
5581             (options & PCRE_UCP) != 0))
5582             #elif !defined COMPILE_PCRE8
5583             if (xclass && (xclass_has_prop || !should_flip_negation))
5584             #endif
5585             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5586             {
5587             /* For non-UCP wide characters, in a non-negative class containing \S or
5588             similar (should_flip_negation is set), all characters greater than 255
5589             must be in the class. */
5590              
5591             if (
5592             #if defined COMPILE_PCRE8
5593             utf &&
5594             #endif
5595             should_flip_negation && !negate_class && (options & PCRE_UCP) == 0)
5596             {
5597             *class_uchardata++ = XCL_RANGE;
5598             if (utf) /* Will always be utf in the 8-bit library */
5599             {
5600             class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5601             class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5602             }
5603             else /* Can only happen for the 16-bit & 32-bit libraries */
5604             {
5605             #if defined COMPILE_PCRE16
5606             *class_uchardata++ = 0x100;
5607             *class_uchardata++ = 0xffffu;
5608             #elif defined COMPILE_PCRE32
5609             *class_uchardata++ = 0x100;
5610             *class_uchardata++ = 0xffffffffu;
5611             #endif
5612             }
5613             }
5614              
5615             *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5616             *code++ = OP_XCLASS;
5617             code += LINK_SIZE;
5618             *code = negate_class? XCL_NOT:0;
5619             if (xclass_has_prop) *code |= XCL_HASPROP;
5620              
5621             /* If the map is required, move up the extra data to make room for it;
5622             otherwise just move the code pointer to the end of the extra data. */
5623              
5624             if (class_has_8bitchar > 0)
5625             {
5626             *code++ |= XCL_MAP;
5627             memmove(code + (32 / sizeof(pcre_uchar)), code,
5628             IN_UCHARS(class_uchardata - code));
5629             if (negate_class && !xclass_has_prop)
5630             for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5631             memcpy(code, classbits, 32);
5632             code = class_uchardata + (32 / sizeof(pcre_uchar));
5633             }
5634             else code = class_uchardata;
5635              
5636             /* Now fill in the complete length of the item */
5637              
5638             PUT(previous, 1, (int)(code - previous));
5639             break; /* End of class handling */
5640             }
5641              
5642             /* Even though any XCLASS list is now discarded, we must allow for
5643             its memory. */
5644              
5645             if (lengthptr != NULL)
5646             *lengthptr += (int)(class_uchardata - class_uchardata_base);
5647             #endif
5648              
5649             /* If there are no characters > 255, or they are all to be included or
5650             excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5651             whole class was negated and whether there were negative specials such as \S
5652             (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5653             negating it if necessary. */
5654              
5655 0 0         *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5656 0 0         if (lengthptr == NULL) /* Save time in the pre-compile phase */
5657             {
5658 0 0         if (negate_class)
5659 0 0         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5660 0           memcpy(code, classbits, 32);
5661             }
5662 0           code += 32 / sizeof(pcre_uchar);
5663              
5664             END_CLASS:
5665 0           break;
5666              
5667              
5668             /* ===================================================================*/
5669             /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5670             has been tested above. */
5671              
5672             case CHAR_LEFT_CURLY_BRACKET:
5673 0 0         if (!is_quantifier) goto NORMAL_CHAR;
5674 0           ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5675 0 0         if (*errorcodeptr != 0) goto FAILED;
5676 0           goto REPEAT;
5677              
5678             case CHAR_ASTERISK:
5679 72           repeat_min = 0;
5680 72           repeat_max = -1;
5681 72           goto REPEAT;
5682              
5683             case CHAR_PLUS:
5684 10           repeat_min = 1;
5685 10           repeat_max = -1;
5686 10           goto REPEAT;
5687              
5688             case CHAR_QUESTION_MARK:
5689 6           repeat_min = 0;
5690 6           repeat_max = 1;
5691              
5692             REPEAT:
5693 88 50         if (previous == NULL)
5694             {
5695 0           *errorcodeptr = ERR9;
5696 0           goto FAILED;
5697             }
5698              
5699 88 100         if (repeat_min == 0)
5700             {
5701 78           firstchar = zerofirstchar; /* Adjust for zero repeat */
5702 78           firstcharflags = zerofirstcharflags;
5703 78           reqchar = zeroreqchar; /* Ditto */
5704 78           reqcharflags = zeroreqcharflags;
5705             }
5706              
5707             /* Remember whether this is a variable length repeat */
5708              
5709 88 50         reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5710              
5711 88           op_type = 0; /* Default single-char op codes */
5712 88           possessive_quantifier = FALSE; /* Default not possessive quantifier */
5713              
5714             /* Save start of previous item, in case we have to move it up in order to
5715             insert something before it. */
5716              
5717 88           tempcode = previous;
5718              
5719             /* Before checking for a possessive quantifier, we must skip over
5720             whitespace and comments in extended mode because Perl allows white space at
5721             this point. */
5722              
5723 88 50         if ((options & PCRE_EXTENDED) != 0)
5724             {
5725 0           const pcre_uchar *p = ptr + 1;
5726             for (;;)
5727             {
5728 0 0         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5729 0 0         if (*p != CHAR_NUMBER_SIGN) break;
5730 0           p++;
5731 0 0         while (*p != CHAR_NULL)
5732             {
5733 0 0         if (IS_NEWLINE(p)) /* For non-fixed-length newline cases, */
    0          
    0          
    0          
    0          
    0          
    0          
    0          
5734             { /* IS_NEWLINE sets cd->nllen. */
5735 0           p += cd->nllen;
5736 0           break;
5737             }
5738 0           p++;
5739             #ifdef SUPPORT_UTF
5740             if (utf) FORWARDCHAR(p);
5741             #endif
5742             } /* Loop for comment characters */
5743 0           } /* Loop for multiple comments */
5744 0           ptr = p - 1; /* Character before the next significant one. */
5745             }
5746              
5747             /* We also need to skip over (?# comments, which are not dependent on
5748             extended mode. */
5749              
5750 88 50         if (ptr[1] == CHAR_LEFT_PARENTHESIS && ptr[2] == CHAR_QUESTION_MARK &&
    0          
    0          
5751 0           ptr[3] == CHAR_NUMBER_SIGN)
5752             {
5753 0           ptr += 4;
5754 0 0         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
    0          
5755 0 0         if (*ptr == CHAR_NULL)
5756             {
5757 0           *errorcodeptr = ERR18;
5758 0           goto FAILED;
5759             }
5760             }
5761              
5762             /* If the next character is '+', we have a possessive quantifier. This
5763             implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5764             If the next character is '?' this is a minimizing repeat, by default,
5765             but if PCRE_UNGREEDY is set, it works the other way round. We change the
5766             repeat type to the non-default. */
5767              
5768 88 50         if (ptr[1] == CHAR_PLUS)
5769             {
5770 0           repeat_type = 0; /* Force greedy */
5771 0           possessive_quantifier = TRUE;
5772 0           ptr++;
5773             }
5774 88 50         else if (ptr[1] == CHAR_QUESTION_MARK)
5775             {
5776 0           repeat_type = greedy_non_default;
5777 0           ptr++;
5778             }
5779 88           else repeat_type = greedy_default;
5780              
5781             /* If previous was a recursion call, wrap it in atomic brackets so that
5782             previous becomes the atomic group. All recursions were so wrapped in the
5783             past, but it no longer happens for non-repeated recursions. In fact, the
5784             repeated ones could be re-implemented independently so as not to need this,
5785             but for the moment we rely on the code for repeating groups. */
5786              
5787 88 50         if (*previous == OP_RECURSE)
5788             {
5789 0           memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5790 0           *previous = OP_ONCE;
5791 0           PUT(previous, 1, 2 + 2*LINK_SIZE);
5792 0           previous[2 + 2*LINK_SIZE] = OP_KET;
5793 0           PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5794 0           code += 2 + 2 * LINK_SIZE;
5795 0           length_prevgroup = 3 + 3*LINK_SIZE;
5796              
5797             /* When actually compiling, we need to check whether this was a forward
5798             reference, and if so, adjust the offset. */
5799              
5800 0 0         if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
    0          
5801             {
5802 0           int offset = GET(cd->hwm, -LINK_SIZE);
5803 0 0         if (offset == previous + 1 - cd->start_code)
5804 0           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5805             }
5806             }
5807              
5808             /* Now handle repetition for the different types of item. */
5809              
5810             /* If previous was a character or negated character match, abolish the item
5811             and generate a repeat item instead. If a char item has a minimum of more
5812             than one, ensure that it is set in reqchar - it might not be if a sequence
5813             such as x{3} is the first thing in a branch because the x will have gone
5814             into firstchar instead. */
5815              
5816 88 50         if (*previous == OP_CHAR || *previous == OP_CHARI
    50          
5817 88 50         || *previous == OP_NOT || *previous == OP_NOTI)
    50          
5818             {
5819 0           switch (*previous)
5820             {
5821             default: /* Make compiler happy. */
5822 0           case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5823 0           case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5824 0           case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5825 0           case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5826             }
5827              
5828             /* Deal with UTF characters that take up more than one character. It's
5829             easier to write this out separately than try to macrify it. Use c to
5830             hold the length of the character in bytes, plus UTF_LENGTH to flag that
5831             it's a length rather than a small character. */
5832              
5833             #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5834             if (utf && NOT_FIRSTCHAR(code[-1]))
5835             {
5836             pcre_uchar *lastchar = code - 1;
5837             BACKCHAR(lastchar);
5838             c = (int)(code - lastchar); /* Length of UTF-8 character */
5839             memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5840             c |= UTF_LENGTH; /* Flag c as a length */
5841             }
5842             else
5843             #endif /* SUPPORT_UTF */
5844              
5845             /* Handle the case of a single character - either with no UTF support, or
5846             with UTF disabled, or for a single character UTF character. */
5847             {
5848 0           c = code[-1];
5849 0 0         if (*previous <= OP_CHARI && repeat_min > 1)
    0          
5850             {
5851 0           reqchar = c;
5852 0           reqcharflags = req_caseopt | cd->req_varyopt;
5853             }
5854             }
5855              
5856 0           goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5857             }
5858              
5859             /* If previous was a character type match (\d or similar), abolish it and
5860             create a suitable repeat item. The code is shared with single-character
5861             repeats by setting op_type to add a suitable offset into repeat_type. Note
5862             that the Unicode property types will be present only when SUPPORT_UCP is
5863             defined, but we don't wrap the little bits of code here because it just
5864             makes it horribly messy. */
5865              
5866 88 100         else if (*previous < OP_EODN)
5867             {
5868             pcre_uchar *oldcode;
5869             int prop_type, prop_value;
5870 82           op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5871 82           c = *previous;
5872              
5873             OUTPUT_SINGLE_REPEAT:
5874 82 50         if (*previous == OP_PROP || *previous == OP_NOTPROP)
    50          
5875             {
5876 0           prop_type = previous[1];
5877 0           prop_value = previous[2];
5878             }
5879 82           else prop_type = prop_value = -1;
5880              
5881 82           oldcode = code;
5882 82           code = previous; /* Usually overwrite previous item */
5883              
5884             /* If the maximum is zero then the minimum must also be zero; Perl allows
5885             this case, so we do too - by simply omitting the item altogether. */
5886              
5887 82 50         if (repeat_max == 0) goto END_REPEAT;
5888              
5889             /* Combine the op_type with the repeat_type */
5890              
5891 82           repeat_type += op_type;
5892              
5893             /* A minimum of zero is handled either as the special case * or ?, or as
5894             an UPTO, with the maximum given. */
5895              
5896 82 100         if (repeat_min == 0)
5897             {
5898 72 50         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5899 0 0         else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5900             else
5901             {
5902 0           *code++ = OP_UPTO + repeat_type;
5903 72           PUT2INC(code, 0, repeat_max);
5904             }
5905             }
5906              
5907             /* A repeat minimum of 1 is optimized into some special cases. If the
5908             maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5909             left in place and, if the maximum is greater than 1, we use OP_UPTO with
5910             one less than the maximum. */
5911              
5912 10 50         else if (repeat_min == 1)
5913             {
5914 10 50         if (repeat_max == -1)
5915 10           *code++ = OP_PLUS + repeat_type;
5916             else
5917             {
5918 0           code = oldcode; /* leave previous item in place */
5919 0 0         if (repeat_max == 1) goto END_REPEAT;
5920 0           *code++ = OP_UPTO + repeat_type;
5921 10           PUT2INC(code, 0, repeat_max - 1);
5922             }
5923             }
5924              
5925             /* The case {n,n} is just an EXACT, while the general case {n,m} is
5926             handled as an EXACT followed by an UPTO. */
5927              
5928             else
5929             {
5930 0           *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5931 0           PUT2INC(code, 0, repeat_min);
5932              
5933             /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5934             we have to insert the character for the previous code. For a repeated
5935             Unicode property match, there are two extra bytes that define the
5936             required property. In UTF-8 mode, long characters have their length in
5937             c, with the UTF_LENGTH bit as a flag. */
5938              
5939 0 0         if (repeat_max < 0)
5940             {
5941             #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5942             if (utf && (c & UTF_LENGTH) != 0)
5943             {
5944             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5945             code += c & 7;
5946             }
5947             else
5948             #endif
5949             {
5950 0           *code++ = c;
5951 0 0         if (prop_type >= 0)
5952             {
5953 0           *code++ = prop_type;
5954 0           *code++ = prop_value;
5955             }
5956             }
5957 0           *code++ = OP_STAR + repeat_type;
5958             }
5959              
5960             /* Else insert an UPTO if the max is greater than the min, again
5961             preceded by the character, for the previously inserted code. If the
5962             UPTO is just for 1 instance, we can use QUERY instead. */
5963              
5964 0 0         else if (repeat_max != repeat_min)
5965             {
5966             #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5967             if (utf && (c & UTF_LENGTH) != 0)
5968             {
5969             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5970             code += c & 7;
5971             }
5972             else
5973             #endif
5974 0           *code++ = c;
5975 0 0         if (prop_type >= 0)
5976             {
5977 0           *code++ = prop_type;
5978 0           *code++ = prop_value;
5979             }
5980 0           repeat_max -= repeat_min;
5981              
5982 0 0         if (repeat_max == 1)
5983             {
5984 0           *code++ = OP_QUERY + repeat_type;
5985             }
5986             else
5987             {
5988 0           *code++ = OP_UPTO + repeat_type;
5989 0           PUT2INC(code, 0, repeat_max);
5990             }
5991             }
5992             }
5993              
5994             /* The character or character type itself comes last in all cases. */
5995              
5996             #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5997             if (utf && (c & UTF_LENGTH) != 0)
5998             {
5999             memcpy(code, utf_chars, IN_UCHARS(c & 7));
6000             code += c & 7;
6001             }
6002             else
6003             #endif
6004 82           *code++ = c;
6005              
6006             /* For a repeated Unicode property match, there are two extra bytes that
6007             define the required property. */
6008              
6009             #ifdef SUPPORT_UCP
6010             if (prop_type >= 0)
6011             {
6012             *code++ = prop_type;
6013             *code++ = prop_value;
6014             }
6015             #endif
6016             }
6017              
6018             /* If previous was a character class or a back reference, we put the repeat
6019             stuff after it, but just skip the item if the repeat was {0,0}. */
6020              
6021 6 50         else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
    50          
    50          
6022             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6023             *previous == OP_XCLASS ||
6024             #endif
6025 6 50         *previous == OP_REF || *previous == OP_REFI ||
    50          
6026 6 50         *previous == OP_DNREF || *previous == OP_DNREFI)
6027             {
6028 0 0         if (repeat_max == 0)
6029             {
6030 0           code = previous;
6031 0           goto END_REPEAT;
6032             }
6033              
6034 0 0         if (repeat_min == 0 && repeat_max == -1)
    0          
6035 0           *code++ = OP_CRSTAR + repeat_type;
6036 0 0         else if (repeat_min == 1 && repeat_max == -1)
    0          
6037 0           *code++ = OP_CRPLUS + repeat_type;
6038 0 0         else if (repeat_min == 0 && repeat_max == 1)
    0          
6039 0           *code++ = OP_CRQUERY + repeat_type;
6040             else
6041             {
6042 0           *code++ = OP_CRRANGE + repeat_type;
6043 0           PUT2INC(code, 0, repeat_min);
6044 0 0         if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
6045 0           PUT2INC(code, 0, repeat_max);
6046             }
6047             }
6048              
6049             /* If previous was a bracket group, we may have to replicate it in certain
6050             cases. Note that at this point we can encounter only the "basic" bracket
6051             opcodes such as BRA and CBRA, as this is the place where they get converted
6052             into the more special varieties such as BRAPOS and SBRA. A test for >=
6053             OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
6054             ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
6055             Originally, PCRE did not allow repetition of assertions, but now it does,
6056             for Perl compatibility. */
6057              
6058 6 50         else if (*previous >= OP_ASSERT && *previous <= OP_COND)
    50          
6059 6           {
6060             register int i;
6061 6           int len = (int)(code - previous);
6062 6           size_t base_hwm_offset = item_hwm_offset;
6063 6           pcre_uchar *bralink = NULL;
6064 6           pcre_uchar *brazeroptr = NULL;
6065              
6066             /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
6067             we just ignore the repeat. */
6068              
6069 6 50         if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
    0          
6070 0           goto END_REPEAT;
6071              
6072             /* There is no sense in actually repeating assertions. The only potential
6073             use of repetition is in cases when the assertion is optional. Therefore,
6074             if the minimum is greater than zero, just ignore the repeat. If the
6075             maximum is not zero or one, set it to 1. */
6076              
6077 6 50         if (*previous < OP_ONCE) /* Assertion */
6078             {
6079 0 0         if (repeat_min > 0) goto END_REPEAT;
6080 0 0         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
    0          
6081             }
6082              
6083             /* The case of a zero minimum is special because of the need to stick
6084             OP_BRAZERO in front of it, and because the group appears once in the
6085             data, whereas in other cases it appears the minimum number of times. For
6086             this reason, it is simplest to treat this case separately, as otherwise
6087             the code gets far too messy. There are several special subcases when the
6088             minimum is zero. */
6089              
6090 6 50         if (repeat_min == 0)
6091             {
6092             /* If the maximum is also zero, we used to just omit the group from the
6093             output altogether, like this:
6094              
6095             ** if (repeat_max == 0)
6096             ** {
6097             ** code = previous;
6098             ** goto END_REPEAT;
6099             ** }
6100              
6101             However, that fails when a group or a subgroup within it is referenced
6102             as a subroutine from elsewhere in the pattern, so now we stick in
6103             OP_SKIPZERO in front of it so that it is skipped on execution. As we
6104             don't have a list of which groups are referenced, we cannot do this
6105             selectively.
6106              
6107             If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6108             and do no more at this point. However, we do need to adjust any
6109             OP_RECURSE calls inside the group that refer to the group itself or any
6110             internal or forward referenced group, because the offset is from the
6111             start of the whole regex. Temporarily terminate the pattern while doing
6112             this. */
6113              
6114 6 50         if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
6115             {
6116 6           *code = OP_END;
6117 6           adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6118 6           memmove(previous + 1, previous, IN_UCHARS(len));
6119 6           code++;
6120 6 50         if (repeat_max == 0)
6121             {
6122 0           *previous++ = OP_SKIPZERO;
6123 0           goto END_REPEAT;
6124             }
6125 6           brazeroptr = previous; /* Save for possessive optimizing */
6126 6           *previous++ = OP_BRAZERO + repeat_type;
6127             }
6128              
6129             /* If the maximum is greater than 1 and limited, we have to replicate
6130             in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6131             The first one has to be handled carefully because it's the original
6132             copy, which has to be moved up. The remainder can be handled by code
6133             that is common with the non-zero minimum case below. We have to
6134             adjust the value of repeat_max, since one less copy is required. Once
6135             again, we may have to adjust any OP_RECURSE calls inside the group. */
6136              
6137             else
6138             {
6139             int offset;
6140 0           *code = OP_END;
6141 0           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6142 0           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6143 0           code += 2 + LINK_SIZE;
6144 0           *previous++ = OP_BRAZERO + repeat_type;
6145 0           *previous++ = OP_BRA;
6146              
6147             /* We chain together the bracket offset fields that have to be
6148             filled in later when the ends of the brackets are reached. */
6149              
6150 0 0         offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6151 0           bralink = previous;
6152 0           PUTINC(previous, 0, offset);
6153             }
6154              
6155 6           repeat_max--;
6156             }
6157              
6158             /* If the minimum is greater than zero, replicate the group as many
6159             times as necessary, and adjust the maximum to the number of subsequent
6160             copies that we need. If we set a first char from the group, and didn't
6161             set a required char, copy the latter from the former. If there are any
6162             forward reference subroutine calls in the group, there will be entries on
6163             the workspace list; replicate these with an appropriate increment. */
6164              
6165             else
6166             {
6167 0 0         if (repeat_min > 1)
6168             {
6169             /* In the pre-compile phase, we don't actually do the replication. We
6170             just adjust the length as if we had. Do some paranoid checks for
6171             potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6172             integer type when available, otherwise double. */
6173              
6174 0 0         if (lengthptr != NULL)
6175             {
6176 0           int delta = (repeat_min - 1)*length_prevgroup;
6177 0 0         if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6178 0           (INT64_OR_DOUBLE)length_prevgroup >
6179 0 0         (INT64_OR_DOUBLE)INT_MAX ||
6180 0           OFLOW_MAX - *lengthptr < delta)
6181             {
6182 0           *errorcodeptr = ERR20;
6183 0           goto FAILED;
6184             }
6185 0           *lengthptr += delta;
6186             }
6187              
6188             /* This is compiling for real. If there is a set first byte for
6189             the group, and we have not yet set a "required byte", set it. Make
6190             sure there is enough workspace for copying forward references before
6191             doing the copy. */
6192              
6193             else
6194             {
6195 0 0         if (groupsetfirstchar && reqcharflags < 0)
    0          
6196             {
6197 0           reqchar = firstchar;
6198 0           reqcharflags = firstcharflags;
6199             }
6200              
6201 0 0         for (i = 1; i < repeat_min; i++)
6202             {
6203             pcre_uchar *hc;
6204 0           size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6205 0           memcpy(code, previous, IN_UCHARS(len));
6206              
6207 0 0         while (cd->hwm > cd->start_workspace + cd->workspace_size -
6208 0           WORK_SIZE_SAFETY_MARGIN -
6209 0           (this_hwm_offset - base_hwm_offset))
6210             {
6211 0           *errorcodeptr = expand_workspace(cd);
6212 0 0         if (*errorcodeptr != 0) goto FAILED;
6213             }
6214              
6215 0 0         for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6216 0           hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6217 0           hc += LINK_SIZE)
6218             {
6219 0           PUT(cd->hwm, 0, GET(hc, 0) + len);
6220 0           cd->hwm += LINK_SIZE;
6221             }
6222 0           base_hwm_offset = this_hwm_offset;
6223 0           code += len;
6224             }
6225             }
6226             }
6227              
6228 0 0         if (repeat_max > 0) repeat_max -= repeat_min;
6229             }
6230              
6231             /* This code is common to both the zero and non-zero minimum cases. If
6232             the maximum is limited, it replicates the group in a nested fashion,
6233             remembering the bracket starts on a stack. In the case of a zero minimum,
6234             the first one was set up above. In all cases the repeat_max now specifies
6235             the number of additional copies needed. Again, we must remember to
6236             replicate entries on the forward reference list. */
6237              
6238 6 50         if (repeat_max >= 0)
6239             {
6240             /* In the pre-compile phase, we don't actually do the replication. We
6241             just adjust the length as if we had. For each repetition we must add 1
6242             to the length for BRAZERO and for all but the last repetition we must
6243             add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
6244             paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6245             a 64-bit integer type when available, otherwise double. */
6246              
6247 6 100         if (lengthptr != NULL && repeat_max > 0)
    50          
6248 0           {
6249 0           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6250             2 - 2*LINK_SIZE; /* Last one doesn't nest */
6251 0 0         if ((INT64_OR_DOUBLE)repeat_max *
6252 0           (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6253 0 0         > (INT64_OR_DOUBLE)INT_MAX ||
6254 0           OFLOW_MAX - *lengthptr < delta)
6255             {
6256 0           *errorcodeptr = ERR20;
6257 0           goto FAILED;
6258             }
6259 0           *lengthptr += delta;
6260             }
6261              
6262             /* This is compiling for real */
6263              
6264 6 50         else for (i = repeat_max - 1; i >= 0; i--)
6265             {
6266             pcre_uchar *hc;
6267 0           size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6268              
6269 0           *code++ = OP_BRAZERO + repeat_type;
6270              
6271             /* All but the final copy start a new nesting, maintaining the
6272             chain of brackets outstanding. */
6273              
6274 0 0         if (i != 0)
6275             {
6276             int offset;
6277 0           *code++ = OP_BRA;
6278 0 0         offset = (bralink == NULL)? 0 : (int)(code - bralink);
6279 0           bralink = code;
6280 0           PUTINC(code, 0, offset);
6281             }
6282              
6283 0           memcpy(code, previous, IN_UCHARS(len));
6284              
6285             /* Ensure there is enough workspace for forward references before
6286             copying them. */
6287              
6288 0 0         while (cd->hwm > cd->start_workspace + cd->workspace_size -
6289 0           WORK_SIZE_SAFETY_MARGIN -
6290 0           (this_hwm_offset - base_hwm_offset))
6291             {
6292 0           *errorcodeptr = expand_workspace(cd);
6293 0 0         if (*errorcodeptr != 0) goto FAILED;
6294             }
6295              
6296 0 0         for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6297 0           hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6298 0           hc += LINK_SIZE)
6299             {
6300 0 0         PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
    0          
6301 0           cd->hwm += LINK_SIZE;
6302             }
6303 0           base_hwm_offset = this_hwm_offset;
6304 0           code += len;
6305             }
6306              
6307             /* Now chain through the pending brackets, and fill in their length
6308             fields (which are holding the chain links pro tem). */
6309              
6310 6 50         while (bralink != NULL)
6311             {
6312             int oldlinkoffset;
6313 0           int offset = (int)(code - bralink + 1);
6314 0           pcre_uchar *bra = code - offset;
6315 0           oldlinkoffset = GET(bra, 1);
6316 0 0         bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6317 0           *code++ = OP_KET;
6318 0           PUTINC(code, 0, offset);
6319 0           PUT(bra, 1, offset);
6320             }
6321             }
6322              
6323             /* If the maximum is unlimited, set a repeater in the final copy. For
6324             ONCE brackets, that's all we need to do. However, possessively repeated
6325             ONCE brackets can be converted into non-capturing brackets, as the
6326             behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6327             deal with possessive ONCEs specially.
6328              
6329             Otherwise, when we are doing the actual compile phase, check to see
6330             whether this group is one that could match an empty string. If so,
6331             convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6332             that runtime checking can be done. [This check is also applied to ONCE
6333             groups at runtime, but in a different way.]
6334              
6335             Then, if the quantifier was possessive and the bracket is not a
6336             conditional, we convert the BRA code to the POS form, and the KET code to
6337             KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6338             subpattern at both the start and at the end.) The use of special opcodes
6339             makes it possible to reduce greatly the stack usage in pcre_exec(). If
6340             the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6341              
6342             Then, if the minimum number of matches is 1 or 0, cancel the possessive
6343             flag so that the default action below, of wrapping everything inside
6344             atomic brackets, does not happen. When the minimum is greater than 1,
6345             there will be earlier copies of the group, and so we still have to wrap
6346             the whole thing. */
6347              
6348             else
6349             {
6350 0           pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6351 0           pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6352              
6353             /* Convert possessive ONCE brackets to non-capturing */
6354              
6355 0 0         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
    0          
    0          
6356 0           possessive_quantifier) *bracode = OP_BRA;
6357              
6358             /* For non-possessive ONCE brackets, all we need to do is to
6359             set the KET. */
6360              
6361 0 0         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
    0          
6362 0           *ketcode = OP_KETRMAX + repeat_type;
6363              
6364             /* Handle non-ONCE brackets and possessive ONCEs (which have been
6365             converted to non-capturing above). */
6366              
6367             else
6368             {
6369             /* In the compile phase, check for empty string matching. */
6370              
6371 0 0         if (lengthptr == NULL)
6372             {
6373 0           pcre_uchar *scode = bracode;
6374             do
6375             {
6376 0 0         if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6377             {
6378 0           *bracode += OP_SBRA - OP_BRA;
6379 0           break;
6380             }
6381 0           scode += GET(scode, 1);
6382             }
6383 0 0         while (*scode == OP_ALT);
6384             }
6385              
6386             /* A conditional group with only one branch has an implicit empty
6387             alternative branch. */
6388              
6389 0 0         if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
    0          
6390 0           *bracode = OP_SCOND;
6391              
6392             /* Handle possessive quantifiers. */
6393              
6394 0 0         if (possessive_quantifier)
6395             {
6396             /* For COND brackets, we wrap the whole thing in a possessively
6397             repeated non-capturing bracket, because we have not invented POS
6398             versions of the COND opcodes. Because we are moving code along, we
6399             must ensure that any pending recursive references are updated. */
6400              
6401 0 0         if (*bracode == OP_COND || *bracode == OP_SCOND)
    0          
6402 0           {
6403 0           int nlen = (int)(code - bracode);
6404 0           *code = OP_END;
6405 0           adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6406 0           memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6407 0           code += 1 + LINK_SIZE;
6408 0           nlen += 1 + LINK_SIZE;
6409 0 0         *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6410 0           *code++ = OP_KETRPOS;
6411 0           PUTINC(code, 0, nlen);
6412 0           PUT(bracode, 1, nlen);
6413             }
6414              
6415             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6416              
6417             else
6418             {
6419 0           *bracode += 1; /* Switch to xxxPOS opcodes */
6420 0           *ketcode = OP_KETRPOS;
6421             }
6422              
6423             /* If the minimum is zero, mark it as possessive, then unset the
6424             possessive flag when the minimum is 0 or 1. */
6425              
6426 0 0         if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6427 0 0         if (repeat_min < 2) possessive_quantifier = FALSE;
6428             }
6429              
6430             /* Non-possessive quantifier */
6431              
6432 0           else *ketcode = OP_KETRMAX + repeat_type;
6433             }
6434             }
6435             }
6436              
6437             /* If previous is OP_FAIL, it was generated by an empty class [] in
6438             JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6439             by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6440             error above. We can just ignore the repeat in JS case. */
6441              
6442 0 0         else if (*previous == OP_FAIL) goto END_REPEAT;
6443              
6444             /* Else there's some kind of shambles */
6445              
6446             else
6447             {
6448 0           *errorcodeptr = ERR11;
6449 0           goto FAILED;
6450             }
6451              
6452             /* If the character following a repeat is '+', possessive_quantifier is
6453             TRUE. For some opcodes, there are special alternative opcodes for this
6454             case. For anything else, we wrap the entire repeated item inside OP_ONCE
6455             brackets. Logically, the '+' notation is just syntactic sugar, taken from
6456             Sun's Java package, but the special opcodes can optimize it.
6457              
6458             Some (but not all) possessively repeated subpatterns have already been
6459             completely handled in the code just above. For them, possessive_quantifier
6460             is always FALSE at this stage. Note that the repeated item starts at
6461             tempcode, not at previous, which might be the first part of a string whose
6462             (former) last char we repeated. */
6463              
6464 88 50         if (possessive_quantifier)
6465             {
6466             int len;
6467              
6468             /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6469             However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6470             {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6471             remains is greater than zero, there's a further opcode that can be
6472             handled. If not, do nothing, leaving the EXACT alone. */
6473              
6474 0           switch(*tempcode)
6475             {
6476             case OP_TYPEEXACT:
6477 0           tempcode += PRIV(OP_lengths)[*tempcode] +
6478 0           ((tempcode[1 + IMM2_SIZE] == OP_PROP
6479 0 0         || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
    0          
6480 0           break;
6481              
6482             /* CHAR opcodes are used for exacts whose count is 1. */
6483              
6484             case OP_CHAR:
6485             case OP_CHARI:
6486             case OP_NOT:
6487             case OP_NOTI:
6488             case OP_EXACT:
6489             case OP_EXACTI:
6490             case OP_NOTEXACT:
6491             case OP_NOTEXACTI:
6492 0           tempcode += PRIV(OP_lengths)[*tempcode];
6493             #ifdef SUPPORT_UTF
6494             if (utf && HAS_EXTRALEN(tempcode[-1]))
6495             tempcode += GET_EXTRALEN(tempcode[-1]);
6496             #endif
6497 0           break;
6498              
6499             /* For the class opcodes, the repeat operator appears at the end;
6500             adjust tempcode to point to it. */
6501              
6502             case OP_CLASS:
6503             case OP_NCLASS:
6504 0           tempcode += 1 + 32/sizeof(pcre_uchar);
6505 0           break;
6506              
6507             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6508             case OP_XCLASS:
6509             tempcode += GET(tempcode, 1);
6510             break;
6511             #endif
6512             }
6513              
6514             /* If tempcode is equal to code (which points to the end of the repeated
6515             item), it means we have skipped an EXACT item but there is no following
6516             QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6517             all other cases, tempcode will be pointing to the repeat opcode, and will
6518             be less than code, so the value of len will be greater than 0. */
6519              
6520 0           len = (int)(code - tempcode);
6521 0 0         if (len > 0)
6522             {
6523 0           unsigned int repcode = *tempcode;
6524              
6525             /* There is a table for possessifying opcodes, all of which are less
6526             than OP_CALLOUT. A zero entry means there is no possessified version.
6527             */
6528              
6529 0 0         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
    0          
6530 0           *tempcode = opcode_possessify[repcode];
6531              
6532             /* For opcode without a special possessified version, wrap the item in
6533             ONCE brackets. Because we are moving code along, we must ensure that any
6534             pending recursive references are updated. */
6535              
6536             else
6537             {
6538 0           *code = OP_END;
6539 0           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6540 0           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6541 0           code += 1 + LINK_SIZE;
6542 0           len += 1 + LINK_SIZE;
6543 0           tempcode[0] = OP_ONCE;
6544 0           *code++ = OP_KET;
6545 0           PUTINC(code, 0, len);
6546 0           PUT(tempcode, 1, len);
6547             }
6548             }
6549              
6550             #ifdef NEVER
6551             if (len > 0) switch (*tempcode)
6552             {
6553             case OP_STAR: *tempcode = OP_POSSTAR; break;
6554             case OP_PLUS: *tempcode = OP_POSPLUS; break;
6555             case OP_QUERY: *tempcode = OP_POSQUERY; break;
6556             case OP_UPTO: *tempcode = OP_POSUPTO; break;
6557              
6558             case OP_STARI: *tempcode = OP_POSSTARI; break;
6559             case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
6560             case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6561             case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
6562              
6563             case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
6564             case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
6565             case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6566             case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
6567              
6568             case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
6569             case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
6570             case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6571             case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
6572              
6573             case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
6574             case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
6575             case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6576             case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
6577              
6578             case OP_CRSTAR: *tempcode = OP_CRPOSSTAR; break;
6579             case OP_CRPLUS: *tempcode = OP_CRPOSPLUS; break;
6580             case OP_CRQUERY: *tempcode = OP_CRPOSQUERY; break;
6581             case OP_CRRANGE: *tempcode = OP_CRPOSRANGE; break;
6582              
6583             /* Because we are moving code along, we must ensure that any
6584             pending recursive references are updated. */
6585              
6586             default:
6587             *code = OP_END;
6588             adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6589             memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6590             code += 1 + LINK_SIZE;
6591             len += 1 + LINK_SIZE;
6592             tempcode[0] = OP_ONCE;
6593             *code++ = OP_KET;
6594             PUTINC(code, 0, len);
6595             PUT(tempcode, 1, len);
6596             break;
6597             }
6598             #endif
6599             }
6600              
6601             /* In all cases we no longer have a previous item. We also set the
6602             "follows varying string" flag for subsequently encountered reqchars if
6603             it isn't already set and we have just passed a varying length item. */
6604              
6605             END_REPEAT:
6606 88           previous = NULL;
6607 88           cd->req_varyopt |= reqvary;
6608 88           break;
6609              
6610              
6611             /* ===================================================================*/
6612             /* Start of nested parenthesized sub-expression, or comment or lookahead or
6613             lookbehind or option setting or condition or all the other extended
6614             parenthesis forms. */
6615              
6616             case CHAR_LEFT_PARENTHESIS:
6617 6           ptr++;
6618              
6619             /* Now deal with various "verbs" that can be introduced by '*'. */
6620              
6621 6 50         if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
    0          
6622 0 0         || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6623             {
6624             int i, namelen;
6625 0           int arglen = 0;
6626 0           const char *vn = verbnames;
6627 0           const pcre_uchar *name = ptr + 1;
6628 0           const pcre_uchar *arg = NULL;
6629 0           previous = NULL;
6630 0           ptr++;
6631 0 0         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6632 0           namelen = (int)(ptr - name);
6633              
6634             /* It appears that Perl allows any characters whatsoever, other than
6635             a closing parenthesis, to appear in arguments, so we no longer insist on
6636             letters, digits, and underscores. */
6637              
6638 0 0         if (*ptr == CHAR_COLON)
6639             {
6640 0           arg = ++ptr;
6641 0 0         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
    0          
6642 0           arglen = (int)(ptr - arg);
6643 0 0         if ((unsigned int)arglen > MAX_MARK)
6644             {
6645 0           *errorcodeptr = ERR75;
6646 0           goto FAILED;
6647             }
6648             }
6649              
6650 0 0         if (*ptr != CHAR_RIGHT_PARENTHESIS)
6651             {
6652 0           *errorcodeptr = ERR60;
6653 0           goto FAILED;
6654             }
6655              
6656             /* Scan the table of verb names */
6657              
6658 0 0         for (i = 0; i < verbcount; i++)
6659             {
6660 0 0         if (namelen == verbs[i].len &&
    0          
6661 0           STRNCMP_UC_C8(name, vn, namelen) == 0)
6662             {
6663             int setverb;
6664              
6665             /* Check for open captures before ACCEPT and convert it to
6666             ASSERT_ACCEPT if in an assertion. */
6667              
6668 0 0         if (verbs[i].op == OP_ACCEPT)
6669             {
6670             open_capitem *oc;
6671 0 0         if (arglen != 0)
6672             {
6673 0           *errorcodeptr = ERR59;
6674 0           goto FAILED;
6675             }
6676 0           cd->had_accept = TRUE;
6677 0 0         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6678             {
6679 0 0         if (lengthptr != NULL)
6680             {
6681             #ifdef COMPILE_PCRE8
6682 0           *lengthptr += 1 + IMM2_SIZE;
6683             #elif defined COMPILE_PCRE16
6684             *lengthptr += 2 + IMM2_SIZE;
6685             #elif defined COMPILE_PCRE32
6686             *lengthptr += 4 + IMM2_SIZE;
6687             #endif
6688             }
6689             else
6690             {
6691 0           *code++ = OP_CLOSE;
6692 0           PUT2INC(code, 0, oc->number);
6693             }
6694             }
6695 0 0         setverb = *code++ =
6696 0           (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6697              
6698             /* Do not set firstchar after *ACCEPT */
6699 0 0         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6700             }
6701              
6702             /* Handle other cases with/without an argument */
6703              
6704 0 0         else if (arglen == 0)
6705             {
6706 0 0         if (verbs[i].op < 0) /* Argument is mandatory */
6707             {
6708 0           *errorcodeptr = ERR66;
6709 0           goto FAILED;
6710             }
6711 0           setverb = *code++ = verbs[i].op;
6712             }
6713              
6714             else
6715             {
6716 0 0         if (verbs[i].op_arg < 0) /* Argument is forbidden */
6717             {
6718 0           *errorcodeptr = ERR59;
6719 0           goto FAILED;
6720             }
6721 0           setverb = *code++ = verbs[i].op_arg;
6722 0 0         if (lengthptr != NULL) /* In pass 1 just add in the length */
6723             { /* to avoid potential workspace */
6724 0           *lengthptr += arglen; /* overflow. */
6725 0           *code++ = 0;
6726             }
6727             else
6728             {
6729 0           *code++ = arglen;
6730 0           memcpy(code, arg, IN_UCHARS(arglen));
6731 0           code += arglen;
6732             }
6733 0           *code++ = 0;
6734             }
6735              
6736 0           switch (setverb)
6737             {
6738             case OP_THEN:
6739             case OP_THEN_ARG:
6740 0           cd->external_flags |= PCRE_HASTHEN;
6741 0           break;
6742              
6743             case OP_PRUNE:
6744             case OP_PRUNE_ARG:
6745             case OP_SKIP:
6746             case OP_SKIP_ARG:
6747 0           cd->had_pruneorskip = TRUE;
6748 0           break;
6749             }
6750              
6751 0           break; /* Found verb, exit loop */
6752             }
6753              
6754 0           vn += verbs[i].len + 1;
6755             }
6756              
6757 0 0         if (i < verbcount) continue; /* Successfully handled a verb */
6758 0           *errorcodeptr = ERR60; /* Verb not recognized */
6759 0           goto FAILED;
6760             }
6761              
6762             /* Initialize for "real" parentheses */
6763              
6764 6           newoptions = options;
6765 6           skipbytes = 0;
6766 6           bravalue = OP_CBRA;
6767 6           item_hwm_offset = cd->hwm - cd->start_workspace;
6768 6           reset_bracount = FALSE;
6769              
6770             /* Deal with the extended parentheses; all are introduced by '?', and the
6771             appearance of any of them means that this is not a capturing group. */
6772              
6773 6 50         if (*ptr == CHAR_QUESTION_MARK)
6774             {
6775             int i, set, unset, namelen;
6776             int *optset;
6777             const pcre_uchar *name;
6778             pcre_uchar *slot;
6779              
6780 0           switch (*(++ptr))
6781             {
6782             /* ------------------------------------------------------------ */
6783             case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6784 0           reset_bracount = TRUE;
6785 0           cd->dupgroups = TRUE; /* Record (?| encountered */
6786             /* Fall through */
6787              
6788             /* ------------------------------------------------------------ */
6789             case CHAR_COLON: /* Non-capturing bracket */
6790 0           bravalue = OP_BRA;
6791 0           ptr++;
6792 0           break;
6793              
6794              
6795             /* ------------------------------------------------------------ */
6796             case CHAR_LEFT_PARENTHESIS:
6797 0           bravalue = OP_COND; /* Conditional group */
6798 0           tempptr = ptr;
6799              
6800             /* A condition can be an assertion, a number (referring to a numbered
6801             group's having been set), a name (referring to a named group), or 'R',
6802             referring to recursion. R<digits> and R&name are also permitted for
6803             recursion tests.
6804              
6805             There are ways of testing a named group: (?(name)) is used by Python;
6806             Perl 5.10 onwards uses (?(<name>) or (?('name')).
6807              
6808             There is one unfortunate ambiguity, caused by history. 'R' can be the
6809             recursive thing or the name 'R' (and similarly for 'R' followed by
6810             digits). We look for a name first; if not found, we try the other case.
6811              
6812             For compatibility with auto-callouts, we allow a callout to be
6813             specified before a condition that is an assertion. First, check for the
6814             syntax of a callout; if found, adjust the temporary pointer that is
6815             used to check for an assertion condition. That's all that is needed! */
6816              
6817 0 0         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
    0          
6818             {
6819 0 0         for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
    0          
6820 0 0         if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6821 0           tempptr += i + 1;
6822              
6823             /* tempptr should now be pointing to the opening parenthesis of the
6824             assertion condition. */
6825              
6826 0 0         if (*tempptr != CHAR_LEFT_PARENTHESIS)
6827             {
6828 0           *errorcodeptr = ERR28;
6829 0           goto FAILED;
6830             }
6831             }
6832              
6833             /* For conditions that are assertions, check the syntax, and then exit
6834             the switch. This will take control down to where bracketed groups,
6835             including assertions, are processed. */
6836              
6837 0 0         if (tempptr[1] == CHAR_QUESTION_MARK &&
    0          
6838 0 0         (tempptr[2] == CHAR_EQUALS_SIGN ||
6839 0 0         tempptr[2] == CHAR_EXCLAMATION_MARK ||
6840 0 0         (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6841 0 0         (tempptr[3] == CHAR_EQUALS_SIGN ||
6842 0           tempptr[3] == CHAR_EXCLAMATION_MARK))))
6843             {
6844 0           cd->iscondassert = TRUE;
6845 0           break;
6846             }
6847              
6848             /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6849             need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6850              
6851 0           code[1+LINK_SIZE] = OP_CREF;
6852 0           skipbytes = 1+IMM2_SIZE;
6853 0           refsign = -1; /* => not a number */
6854 0           namelen = -1; /* => not a name; must set to avoid warning */
6855 0           name = NULL; /* Always set to avoid warning */
6856 0           recno = 0; /* Always set to avoid warning */
6857              
6858             /* Check for a test for recursion in a named group. */
6859              
6860 0           ptr++;
6861 0 0         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
    0          
6862             {
6863 0           terminator = -1;
6864 0           ptr += 2;
6865 0           code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6866             }
6867              
6868             /* Check for a test for a named group's having been set, using the Perl
6869             syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6870             syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6871              
6872 0 0         else if (*ptr == CHAR_LESS_THAN_SIGN)
6873             {
6874 0           terminator = CHAR_GREATER_THAN_SIGN;
6875 0           ptr++;
6876             }
6877 0 0         else if (*ptr == CHAR_APOSTROPHE)
6878             {
6879 0           terminator = CHAR_APOSTROPHE;
6880 0           ptr++;
6881             }
6882             else
6883             {
6884 0           terminator = CHAR_NULL;
6885 0 0         if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
    0          
6886 0 0         else if (IS_DIGIT(*ptr)) refsign = 0;
    0          
6887             }
6888              
6889             /* Handle a number */
6890              
6891 0 0         if (refsign >= 0)
6892             {
6893 0 0         while (IS_DIGIT(*ptr))
    0          
6894             {
6895 0 0         if (recno > INT_MAX / 10 - 1) /* Integer overflow */
6896             {
6897 0 0         while (IS_DIGIT(*ptr)) ptr++;
    0          
6898 0           *errorcodeptr = ERR61;
6899 0           goto FAILED;
6900             }
6901 0           recno = recno * 10 + (int)(*ptr - CHAR_0);
6902 0           ptr++;
6903             }
6904             }
6905              
6906             /* Otherwise we expect to read a name; anything else is an error. When
6907             a name is one of a number of duplicates, a different opcode is used and
6908             it needs more memory. Unfortunately we cannot tell whether a name is a
6909             duplicate in the first pass, so we have to allow for more memory. */
6910              
6911             else
6912             {
6913 0 0         if (IS_DIGIT(*ptr))
    0          
6914             {
6915 0           *errorcodeptr = ERR84;
6916 0           goto FAILED;
6917             }
6918 0 0         if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6919             {
6920 0           *errorcodeptr = ERR28; /* Assertion expected */
6921 0           goto FAILED;
6922             }
6923 0           name = ptr++;
6924 0 0         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6925             {
6926 0           ptr++;
6927             }
6928 0           namelen = (int)(ptr - name);
6929 0 0         if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6930             }
6931              
6932             /* Check the terminator */
6933              
6934 0 0         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
    0          
    0          
6935 0           *ptr++ != CHAR_RIGHT_PARENTHESIS)
6936             {
6937 0           ptr--; /* Error offset */
6938 0           *errorcodeptr = ERR26; /* Malformed number or name */
6939 0           goto FAILED;
6940             }
6941              
6942             /* Do no further checking in the pre-compile phase. */
6943              
6944 0 0         if (lengthptr != NULL) break;
6945              
6946             /* In the real compile we do the work of looking for the actual
6947             reference. If refsign is not negative, it means we have a number in
6948             recno. */
6949              
6950 0 0         if (refsign >= 0)
6951             {
6952 0 0         if (recno <= 0)
6953             {
6954 0           *errorcodeptr = ERR35;
6955 0           goto FAILED;
6956             }
6957 0 0         if (refsign != 0) recno = (refsign == CHAR_MINUS)?
    0          
6958 0           cd->bracount - recno + 1 : recno + cd->bracount;
6959 0 0         if (recno <= 0 || recno > cd->final_bracount)
    0          
6960             {
6961 0           *errorcodeptr = ERR15;
6962 0           goto FAILED;
6963             }
6964 0           PUT2(code, 2+LINK_SIZE, recno);
6965 0 0         if (recno > cd->top_backref) cd->top_backref = recno;
6966 0           break;
6967             }
6968              
6969             /* Otherwise look for the name. */
6970              
6971 0           slot = cd->name_table;
6972 0 0         for (i = 0; i < cd->names_found; i++)
6973             {
6974 0 0         if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
    0          
6975 0           slot[IMM2_SIZE+namelen] == 0) break;
6976 0           slot += cd->name_entry_size;
6977             }
6978              
6979             /* Found the named subpattern. If the name is duplicated, add one to
6980             the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6981             appropriate data values. Otherwise, just insert the unique subpattern
6982             number. */
6983              
6984 0 0         if (i < cd->names_found)
6985             {
6986 0           int offset = i++;
6987 0           int count = 1;
6988 0           recno = GET2(slot, 0); /* Number from first found */
6989 0 0         if (recno > cd->top_backref) cd->top_backref = recno;
6990 0 0         for (; i < cd->names_found; i++)
6991             {
6992 0           slot += cd->name_entry_size;
6993 0 0         if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
    0          
6994 0           (slot+IMM2_SIZE)[namelen] != 0) break;
6995 0           count++;
6996             }
6997              
6998 0 0         if (count > 1)
6999             {
7000 0           PUT2(code, 2+LINK_SIZE, offset);
7001 0           PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
7002 0           skipbytes += IMM2_SIZE;
7003 0           code[1+LINK_SIZE]++;
7004             }
7005             else /* Not a duplicated name */
7006             {
7007 0           PUT2(code, 2+LINK_SIZE, recno);
7008             }
7009             }
7010              
7011             /* If terminator == CHAR_NULL it means that the name followed directly
7012             after the opening parenthesis [e.g. (?(abc)...] and in this case there
7013             are some further alternatives to try. For the cases where terminator !=
7014             CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
7015             we have now checked all the possibilities, so give an error. */
7016              
7017 0 0         else if (terminator != CHAR_NULL)
7018             {
7019 0           *errorcodeptr = ERR15;
7020 0           goto FAILED;
7021             }
7022              
7023             /* Check for (?(R) for recursion. Allow digits after R to specify a
7024             specific group number. */
7025              
7026 0 0         else if (*name == CHAR_R)
7027             {
7028 0           recno = 0;
7029 0 0         for (i = 1; i < namelen; i++)
7030             {
7031 0 0         if (!IS_DIGIT(name[i]))
    0          
7032             {
7033 0           *errorcodeptr = ERR15;
7034 0           goto FAILED;
7035             }
7036 0 0         if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7037             {
7038 0           *errorcodeptr = ERR61;
7039 0           goto FAILED;
7040             }
7041 0           recno = recno * 10 + name[i] - CHAR_0;
7042             }
7043 0 0         if (recno == 0) recno = RREF_ANY;
7044 0           code[1+LINK_SIZE] = OP_RREF; /* Change test type */
7045 0           PUT2(code, 2+LINK_SIZE, recno);
7046             }
7047              
7048             /* Similarly, check for the (?(DEFINE) "condition", which is always
7049             false. */
7050              
7051 0 0         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
    0          
7052             {
7053 0           code[1+LINK_SIZE] = OP_DEF;
7054 0           skipbytes = 1;
7055             }
7056              
7057             /* Reference to an unidentified subpattern. */
7058              
7059             else
7060             {
7061 0           *errorcodeptr = ERR15;
7062 0           goto FAILED;
7063             }
7064 0           break;
7065              
7066              
7067             /* ------------------------------------------------------------ */
7068             case CHAR_EQUALS_SIGN: /* Positive lookahead */
7069 0           bravalue = OP_ASSERT;
7070 0           cd->assert_depth += 1;
7071 0           ptr++;
7072 0           break;
7073              
7074             /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
7075             thing to do, but Perl allows all assertions to be quantified, and when
7076             they contain capturing parentheses there may be a potential use for
7077             this feature. Not that that applies to a quantified (?!) but we allow
7078             it for uniformity. */
7079              
7080             /* ------------------------------------------------------------ */
7081             case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
7082 0           ptr++;
7083 0 0         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
    0          
    0          
7084 0 0         ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
    0          
7085 0 0         (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
7086             {
7087 0           *code++ = OP_FAIL;
7088 0           previous = NULL;
7089 0           continue;
7090             }
7091 0           bravalue = OP_ASSERT_NOT;
7092 0           cd->assert_depth += 1;
7093 0           break;
7094              
7095              
7096             /* ------------------------------------------------------------ */
7097             case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
7098 0           switch (ptr[1])
7099             {
7100             case CHAR_EQUALS_SIGN: /* Positive lookbehind */
7101 0           bravalue = OP_ASSERTBACK;
7102 0           cd->assert_depth += 1;
7103 0           ptr += 2;
7104 0           break;
7105              
7106             case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
7107 0           bravalue = OP_ASSERTBACK_NOT;
7108 0           cd->assert_depth += 1;
7109 0           ptr += 2;
7110 0           break;
7111              
7112             default: /* Could be name define, else bad */
7113 0 0         if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7114 0           goto DEFINE_NAME;
7115 0           ptr++; /* Correct offset for error */
7116 0           *errorcodeptr = ERR24;
7117 0           goto FAILED;
7118             }
7119 0           break;
7120              
7121              
7122             /* ------------------------------------------------------------ */
7123             case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
7124 0           bravalue = OP_ONCE;
7125 0           ptr++;
7126 0           break;
7127              
7128              
7129             /* ------------------------------------------------------------ */
7130             case CHAR_C: /* Callout - may be followed by digits; */
7131 0           previous_callout = code; /* Save for later completion */
7132 0           after_manual_callout = 1; /* Skip one item before completing */
7133 0           *code++ = OP_CALLOUT;
7134             {
7135 0           int n = 0;
7136 0           ptr++;
7137 0 0         while(IS_DIGIT(*ptr))
    0          
7138             {
7139 0           n = n * 10 + *ptr++ - CHAR_0;
7140 0 0         if (n > 255)
7141             {
7142 0           *errorcodeptr = ERR38;
7143 0           goto FAILED;
7144             }
7145             }
7146 0 0         if (*ptr != CHAR_RIGHT_PARENTHESIS)
7147             {
7148 0           *errorcodeptr = ERR39;
7149 0           goto FAILED;
7150             }
7151 0           *code++ = n;
7152 0           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7153 0           PUT(code, LINK_SIZE, 0); /* Default length */
7154 0           code += 2 * LINK_SIZE;
7155             }
7156 0           previous = NULL;
7157 0           continue;
7158              
7159              
7160             /* ------------------------------------------------------------ */
7161             case CHAR_P: /* Python-style named subpattern handling */
7162 0 0         if (*(++ptr) == CHAR_EQUALS_SIGN ||
    0          
7163 0           *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
7164             {
7165 0           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7166 0           terminator = CHAR_RIGHT_PARENTHESIS;
7167 0           goto NAMED_REF_OR_RECURSE;
7168             }
7169 0 0         else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
7170             {
7171 0           *errorcodeptr = ERR41;
7172 0           goto FAILED;
7173             }
7174             /* Fall through to handle (?P< as (?< is handled */
7175              
7176              
7177             /* ------------------------------------------------------------ */
7178             DEFINE_NAME: /* Come here from (?< handling */
7179             case CHAR_APOSTROPHE:
7180 0           terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7181 0 0         CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7182 0           name = ++ptr;
7183 0 0         if (IS_DIGIT(*ptr))
    0          
7184             {
7185 0           *errorcodeptr = ERR84; /* Group name must start with non-digit */
7186 0           goto FAILED;
7187             }
7188 0 0         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7189 0           namelen = (int)(ptr - name);
7190              
7191             /* In the pre-compile phase, do a syntax check, remember the longest
7192             name, and then remember the group in a vector, expanding it if
7193             necessary. Duplicates for the same number are skipped; other duplicates
7194             are checked for validity. In the actual compile, there is nothing to
7195             do. */
7196              
7197 0 0         if (lengthptr != NULL)
7198             {
7199             named_group *ng;
7200 0           pcre_uint32 number = cd->bracount + 1;
7201              
7202 0 0         if (*ptr != (pcre_uchar)terminator)
7203             {
7204 0           *errorcodeptr = ERR42;
7205 0           goto FAILED;
7206             }
7207              
7208 0 0         if (cd->names_found >= MAX_NAME_COUNT)
7209             {
7210 0           *errorcodeptr = ERR49;
7211 0           goto FAILED;
7212             }
7213              
7214 0 0         if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7215             {
7216 0           cd->name_entry_size = namelen + IMM2_SIZE + 1;
7217 0 0         if (namelen > MAX_NAME_SIZE)
7218             {
7219 0           *errorcodeptr = ERR48;
7220 0           goto FAILED;
7221             }
7222             }
7223              
7224             /* Scan the list to check for duplicates. For duplicate names, if the
7225             number is the same, break the loop, which causes the name to be
7226             discarded; otherwise, if DUPNAMES is not set, give an error.
7227             If it is set, allow the name with a different number, but continue
7228             scanning in case this is a duplicate with the same number. For
7229             non-duplicate names, give an error if the number is duplicated. */
7230              
7231 0           ng = cd->named_groups;
7232 0 0         for (i = 0; i < cd->names_found; i++, ng++)
7233             {
7234 0 0         if (namelen == ng->length &&
    0          
7235 0           STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7236             {
7237 0 0         if (ng->number == number) break;
7238 0 0         if ((options & PCRE_DUPNAMES) == 0)
7239             {
7240 0           *errorcodeptr = ERR43;
7241 0           goto FAILED;
7242             }
7243 0           cd->dupnames = TRUE; /* Duplicate names exist */
7244             }
7245 0 0         else if (ng->number == number)
7246             {
7247 0           *errorcodeptr = ERR65;
7248 0           goto FAILED;
7249             }
7250             }
7251              
7252 0 0         if (i >= cd->names_found) /* Not a duplicate with same number */
7253             {
7254             /* Increase the list size if necessary */
7255              
7256 0 0         if (cd->names_found >= cd->named_group_list_size)
7257             {
7258 0           int newsize = cd->named_group_list_size * 2;
7259 0           named_group *newspace = (PUBL(malloc))
7260             (newsize * sizeof(named_group));
7261              
7262 0 0         if (newspace == NULL)
7263             {
7264 0           *errorcodeptr = ERR21;
7265 0           goto FAILED;
7266             }
7267              
7268 0           memcpy(newspace, cd->named_groups,
7269 0           cd->named_group_list_size * sizeof(named_group));
7270 0 0         if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7271 0           (PUBL(free))((void *)cd->named_groups);
7272 0           cd->named_groups = newspace;
7273 0           cd->named_group_list_size = newsize;
7274             }
7275              
7276 0           cd->named_groups[cd->names_found].name = name;
7277 0           cd->named_groups[cd->names_found].length = namelen;
7278 0           cd->named_groups[cd->names_found].number = number;
7279 0           cd->names_found++;
7280             }
7281             }
7282              
7283 0           ptr++; /* Move past > or ' in both passes. */
7284 0           goto NUMBERED_GROUP;
7285              
7286              
7287             /* ------------------------------------------------------------ */
7288             case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
7289 0           terminator = CHAR_RIGHT_PARENTHESIS;
7290 0           is_recurse = TRUE;
7291             /* Fall through */
7292              
7293             /* We come here from the Python syntax above that handles both
7294             references (?P=name) and recursion (?P>name), as well as falling
7295             through from the Perl recursion syntax (?&name). We also come here from
7296             the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7297             .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7298              
7299             NAMED_REF_OR_RECURSE:
7300 0           name = ++ptr;
7301 0 0         if (IS_DIGIT(*ptr))
    0          
7302             {
7303 0           *errorcodeptr = ERR84; /* Group name must start with non-digit */
7304 0           goto FAILED;
7305             }
7306 0 0         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7307 0           namelen = (int)(ptr - name);
7308              
7309             /* In the pre-compile phase, do a syntax check. We used to just set
7310             a dummy reference number, because it was not used in the first pass.
7311             However, with the change of recursive back references to be atomic,
7312             we have to look for the number so that this state can be identified, as
7313             otherwise the incorrect length is computed. If it's not a backwards
7314             reference, the dummy number will do. */
7315              
7316 0 0         if (lengthptr != NULL)
7317             {
7318             named_group *ng;
7319 0           recno = 0;
7320              
7321 0 0         if (namelen == 0)
7322             {
7323 0           *errorcodeptr = ERR62;
7324 0           goto FAILED;
7325             }
7326 0 0         if (*ptr != (pcre_uchar)terminator)
7327             {
7328 0           *errorcodeptr = ERR42;
7329 0           goto FAILED;
7330             }
7331 0 0         if (namelen > MAX_NAME_SIZE)
7332             {
7333 0           *errorcodeptr = ERR48;
7334 0           goto FAILED;
7335             }
7336              
7337             /* Count named back references. */
7338              
7339 0 0         if (!is_recurse) cd->namedrefcount++;
7340              
7341             /* We have to allow for a named reference to a duplicated name (this
7342             cannot be determined until the second pass). This needs an extra
7343             16-bit data item. */
7344              
7345 0           *lengthptr += IMM2_SIZE;
7346              
7347             /* If this is a forward reference and we are within a (?|...) group,
7348             the reference may end up as the number of a group which we are
7349             currently inside, that is, it could be a recursive reference. In the
7350             real compile this will be picked up and the reference wrapped with
7351             OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7352              
7353             /* In fact, this can happen for a non-forward reference because
7354             another group with the same number might be created later. This
7355             issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7356             only mode, we finesse the bug by allowing more memory always. */
7357              
7358 0           *lengthptr += 4 + 4*LINK_SIZE;
7359              
7360             /* It is even worse than that. The current reference may be to an
7361             existing named group with a different number (so apparently not
7362             recursive) but which later on is also attached to a group with the
7363             current number. This can only happen if (?| has been previously
7364             encountered. In that case, we allow yet more memory, just in case.
7365             (Again, this is fixed "properly" in PCRE2.) */
7366              
7367 0 0         if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7368              
7369             /* Otherwise, check for recursion here. The name table does not exist
7370             in the first pass; instead we must scan the list of names encountered
7371             so far in order to get the number. If the name is not found, leave
7372             the value of recno as 0 for a forward reference. */
7373              
7374             /* This patch (removing "else") fixes a problem when a reference is
7375             to multiple identically named nested groups from within the nest.
7376             Once again, it is not the "proper" fix, and it results in an
7377             over-allocation of memory. */
7378              
7379             /* else */
7380             {
7381 0           ng = cd->named_groups;
7382 0 0         for (i = 0; i < cd->names_found; i++, ng++)
7383             {
7384 0 0         if (namelen == ng->length &&
    0          
7385 0           STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7386             {
7387             open_capitem *oc;
7388 0           recno = ng->number;
7389 0 0         if (is_recurse) break;
7390 0 0         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7391             {
7392 0 0         if (oc->number == recno)
7393             {
7394 0           oc->flag = TRUE;
7395 0           break;
7396             }
7397             }
7398             }
7399             }
7400             }
7401             }
7402              
7403             /* In the real compile, search the name table. We check the name
7404             first, and then check that we have reached the end of the name in the
7405             table. That way, if the name is longer than any in the table, the
7406             comparison will fail without reading beyond the table entry. */
7407              
7408             else
7409             {
7410 0           slot = cd->name_table;
7411 0 0         for (i = 0; i < cd->names_found; i++)
7412             {
7413 0 0         if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
    0          
7414 0           slot[IMM2_SIZE+namelen] == 0)
7415 0           break;
7416 0           slot += cd->name_entry_size;
7417             }
7418              
7419 0 0         if (i < cd->names_found)
7420             {
7421 0           recno = GET2(slot, 0);
7422             }
7423             else
7424             {
7425 0           *errorcodeptr = ERR15;
7426 0           goto FAILED;
7427             }
7428             }
7429              
7430             /* In both phases, for recursions, we can now go to the code that
7431             handles numerical recursion. */
7432              
7433 0 0         if (is_recurse) goto HANDLE_RECURSION;
7434              
7435             /* In the second pass we must see if the name is duplicated. If so, we
7436             generate a different opcode. */
7437              
7438 0 0         if (lengthptr == NULL && cd->dupnames)
    0          
7439             {
7440 0           int count = 1;
7441 0           unsigned int index = i;
7442 0           pcre_uchar *cslot = slot + cd->name_entry_size;
7443              
7444 0 0         for (i++; i < cd->names_found; i++)
7445             {
7446 0 0         if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7447 0           count++;
7448 0           cslot += cd->name_entry_size;
7449             }
7450              
7451 0 0         if (count > 1)
7452             {
7453 0 0         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7454 0           previous = code;
7455 0           item_hwm_offset = cd->hwm - cd->start_workspace;
7456 0 0         *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7457 0           PUT2INC(code, 0, index);
7458 0           PUT2INC(code, 0, count);
7459              
7460             /* Process each potentially referenced group. */
7461              
7462 0 0         for (; slot < cslot; slot += cd->name_entry_size)
7463             {
7464             open_capitem *oc;
7465 0           recno = GET2(slot, 0);
7466 0 0         cd->backref_map |= (recno < 32)? (1U << recno) : 1;
7467 0 0         if (recno > cd->top_backref) cd->top_backref = recno;
7468              
7469             /* Check to see if this back reference is recursive, that is, it
7470             is inside the group that it references. A flag is set so that the
7471             group can be made atomic. */
7472              
7473 0 0         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7474             {
7475 0 0         if (oc->number == recno)
7476             {
7477 0           oc->flag = TRUE;
7478 0           break;
7479             }
7480             }
7481             }
7482              
7483 0           continue; /* End of back ref handling */
7484             }
7485             }
7486              
7487             /* First pass, or a non-duplicated name. */
7488              
7489 0           goto HANDLE_REFERENCE;
7490              
7491              
7492             /* ------------------------------------------------------------ */
7493             case CHAR_R: /* Recursion, same as (?0) */
7494 0           recno = 0;
7495 0 0         if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7496             {
7497 0           *errorcodeptr = ERR29;
7498 0           goto FAILED;
7499             }
7500 0           goto HANDLE_RECURSION;
7501              
7502              
7503             /* ------------------------------------------------------------ */
7504             case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
7505             case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7506             case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7507             {
7508             const pcre_uchar *called;
7509 0           terminator = CHAR_RIGHT_PARENTHESIS;
7510              
7511             /* Come here from the \g<...> and \g'...' code (Oniguruma
7512             compatibility). However, the syntax has been checked to ensure that
7513             the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7514             be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7515             ever be taken. */
7516              
7517             HANDLE_NUMERICAL_RECURSION:
7518              
7519 0 0         if ((refsign = *ptr) == CHAR_PLUS)
7520             {
7521 0           ptr++;
7522 0 0         if (!IS_DIGIT(*ptr))
    0          
7523             {
7524 0           *errorcodeptr = ERR63;
7525 0           goto FAILED;
7526             }
7527             }
7528 0 0         else if (refsign == CHAR_MINUS)
7529             {
7530 0 0         if (!IS_DIGIT(ptr[1]))
    0          
7531             goto OTHER_CHAR_AFTER_QUERY;
7532 0           ptr++;
7533             }
7534              
7535 0           recno = 0;
7536 0 0         while(IS_DIGIT(*ptr))
    0          
7537             {
7538 0 0         if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7539             {
7540 0 0         while (IS_DIGIT(*ptr)) ptr++;
    0          
7541 0           *errorcodeptr = ERR61;
7542 0           goto FAILED;
7543             }
7544 0           recno = recno * 10 + *ptr++ - CHAR_0;
7545             }
7546              
7547 0 0         if (*ptr != (pcre_uchar)terminator)
7548             {
7549 0           *errorcodeptr = ERR29;
7550 0           goto FAILED;
7551             }
7552              
7553 0 0         if (refsign == CHAR_MINUS)
7554             {
7555 0 0         if (recno == 0)
7556             {
7557 0           *errorcodeptr = ERR58;
7558 0           goto FAILED;
7559             }
7560 0           recno = cd->bracount - recno + 1;
7561 0 0         if (recno <= 0)
7562             {
7563 0           *errorcodeptr = ERR15;
7564 0           goto FAILED;
7565             }
7566             }
7567 0 0         else if (refsign == CHAR_PLUS)
7568             {
7569 0 0         if (recno == 0)
7570             {
7571 0           *errorcodeptr = ERR58;
7572 0           goto FAILED;
7573             }
7574 0           recno += cd->bracount;
7575             }
7576              
7577             /* Come here from code above that handles a named recursion */
7578              
7579             HANDLE_RECURSION:
7580              
7581 0           previous = code;
7582 0           item_hwm_offset = cd->hwm - cd->start_workspace;
7583 0           called = cd->start_code;
7584              
7585             /* When we are actually compiling, find the bracket that is being
7586             referenced. Temporarily end the regex in case it doesn't exist before
7587             this point. If we end up with a forward reference, first check that
7588             the bracket does occur later so we can give the error (and position)
7589             now. Then remember this forward reference in the workspace so it can
7590             be filled in at the end. */
7591              
7592 0 0         if (lengthptr == NULL)
7593             {
7594 0           *code = OP_END;
7595 0 0         if (recno != 0)
7596 0           called = PRIV(find_bracket)(cd->start_code, utf, recno);
7597              
7598             /* Forward reference */
7599              
7600 0 0         if (called == NULL)
7601             {
7602 0 0         if (recno > cd->final_bracount)
7603             {
7604 0           *errorcodeptr = ERR15;
7605 0           goto FAILED;
7606             }
7607              
7608             /* Fudge the value of "called" so that when it is inserted as an
7609             offset below, what is actually inserted is the reference number
7610             of the group. Then remember the forward reference. */
7611              
7612 0           called = cd->start_code + recno;
7613 0 0         if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7614             WORK_SIZE_SAFETY_MARGIN)
7615             {
7616 0           *errorcodeptr = expand_workspace(cd);
7617 0 0         if (*errorcodeptr != 0) goto FAILED;
7618             }
7619 0           PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7620             }
7621              
7622             /* If not a forward reference, and the subpattern is still open,
7623             this is a recursive call. We check to see if this is a left
7624             recursion that could loop for ever, and diagnose that case. We
7625             must not, however, do this check if we are in a conditional
7626             subpattern because the condition might be testing for recursion in
7627             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7628             Forever loops are also detected at runtime, so those that occur in
7629             conditional subpatterns will be picked up then. */
7630              
7631 0 0         else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7632 0           could_be_empty(called, code, bcptr, utf, cd))
7633             {
7634 0           *errorcodeptr = ERR40;
7635 0           goto FAILED;
7636             }
7637             }
7638              
7639             /* Insert the recursion/subroutine item. It does not have a set first
7640             character (relevant if it is repeated, because it will then be
7641             wrapped with ONCE brackets). */
7642              
7643 0           *code = OP_RECURSE;
7644 0           PUT(code, 1, (int)(called - cd->start_code));
7645 0           code += 1 + LINK_SIZE;
7646 0           groupsetfirstchar = FALSE;
7647             }
7648              
7649             /* Can't determine a first byte now */
7650              
7651 0 0         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7652 0           zerofirstchar = firstchar;
7653 0           zerofirstcharflags = firstcharflags;
7654 0           continue;
7655              
7656              
7657             /* ------------------------------------------------------------ */
7658             default: /* Other characters: check option setting */
7659             OTHER_CHAR_AFTER_QUERY:
7660 0           set = unset = 0;
7661 0           optset = &set;
7662              
7663 0 0         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
    0          
7664             {
7665 0           switch (*ptr++)
7666             {
7667 0           case CHAR_MINUS: optset = &unset; break;
7668              
7669             case CHAR_J: /* Record that it changed in the external options */
7670 0           *optset |= PCRE_DUPNAMES;
7671 0           cd->external_flags |= PCRE_JCHANGED;
7672 0           break;
7673              
7674 0           case CHAR_i: *optset |= PCRE_CASELESS; break;
7675 0           case CHAR_m: *optset |= PCRE_MULTILINE; break;
7676 0           case CHAR_s: *optset |= PCRE_DOTALL; break;
7677 0           case CHAR_x: *optset |= PCRE_EXTENDED; break;
7678 0           case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7679 0           case CHAR_X: *optset |= PCRE_EXTRA; break;
7680              
7681 0           default: *errorcodeptr = ERR12;
7682 0           ptr--; /* Correct the offset */
7683 0           goto FAILED;
7684             }
7685             }
7686              
7687             /* Set up the changed option bits, but don't change anything yet. */
7688              
7689 0           newoptions = (options | set) & (~unset);
7690              
7691             /* If the options ended with ')' this is not the start of a nested
7692             group with option changes, so the options change at this level.
7693             If we are not at the pattern start, reset the greedy defaults and the
7694             case value for firstchar and reqchar. */
7695              
7696 0 0         if (*ptr == CHAR_RIGHT_PARENTHESIS)
7697             {
7698 0           greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7699 0           greedy_non_default = greedy_default ^ 1;
7700 0           req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7701              
7702             /* Change options at this level, and pass them back for use
7703             in subsequent branches. */
7704              
7705 0           *optionsptr = options = newoptions;
7706 0           previous = NULL; /* This item can't be repeated */
7707 0           continue; /* It is complete */
7708             }
7709              
7710             /* If the options ended with ':' we are heading into a nested group
7711             with possible change of options. Such groups are non-capturing and are
7712             not assertions of any kind. All we need to do is skip over the ':';
7713             the newoptions value is handled below. */
7714              
7715 0           bravalue = OP_BRA;
7716 0           ptr++;
7717             } /* End of switch for character following (? */
7718             } /* End of (? handling */
7719              
7720             /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7721             is set, all unadorned brackets become non-capturing and behave like (?:...)
7722             brackets. */
7723              
7724 6 50         else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7725             {
7726 0           bravalue = OP_BRA;
7727             }
7728              
7729             /* Else we have a capturing group. */
7730              
7731             else
7732             {
7733             NUMBERED_GROUP:
7734 6           cd->bracount += 1;
7735 6           PUT2(code, 1+LINK_SIZE, cd->bracount);
7736 6           skipbytes = IMM2_SIZE;
7737             }
7738              
7739             /* Process nested bracketed regex. First check for parentheses nested too
7740             deeply. */
7741              
7742 6 50         if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7743             {
7744 0           *errorcodeptr = ERR82;
7745 0           goto FAILED;
7746             }
7747              
7748             /* All assertions used not to be repeatable, but this was changed for Perl
7749             compatibility. All kinds can now be repeated except for assertions that are
7750             conditions (Perl also forbids these to be repeated). We copy code into a
7751             non-register variable (tempcode) in order to be able to pass its address
7752             because some compilers complain otherwise. At the start of a conditional
7753             group whose condition is an assertion, cd->iscondassert is set. We unset it
7754             here so as to allow assertions later in the group to be quantified. */
7755              
7756 6 50         if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
    50          
    0          
7757 0           cd->iscondassert)
7758             {
7759 0           previous = NULL;
7760 0           cd->iscondassert = FALSE;
7761             }
7762             else
7763             {
7764 6           previous = code;
7765 6           item_hwm_offset = cd->hwm - cd->start_workspace;
7766             }
7767              
7768 6           *code = bravalue;
7769 6           tempcode = code;
7770 6           tempreqvary = cd->req_varyopt; /* Save value before bracket */
7771 6           tempbracount = cd->bracount; /* Save value before bracket */
7772 6           length_prevgroup = 0; /* Initialize for pre-compile phase */
7773              
7774 6 100         if (!compile_regex(
    50          
    50          
    50          
7775             newoptions, /* The complete new option state */
7776             &tempcode, /* Where to put code (updated) */
7777             &ptr, /* Input pointer (updated) */
7778             errorcodeptr, /* Where to put an error message */
7779             (bravalue == OP_ASSERTBACK ||
7780             bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7781             reset_bracount, /* True if (?| group */
7782             skipbytes, /* Skip over bracket number */
7783             cond_depth +
7784 6           ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */
7785             &subfirstchar, /* For possible first char */
7786             &subfirstcharflags,
7787             &subreqchar, /* For possible last char */
7788             &subreqcharflags,
7789             bcptr, /* Current branch chain */
7790             cd, /* Tables block */
7791             (lengthptr == NULL)? NULL : /* Actual compile phase */
7792             &length_prevgroup /* Pre-compile phase */
7793             ))
7794 0           goto FAILED;
7795              
7796 6           cd->parens_depth -= 1;
7797              
7798             /* If this was an atomic group and there are no capturing groups within it,
7799             generate OP_ONCE_NC instead of OP_ONCE. */
7800              
7801 6 50         if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
    0          
7802 0           *code = OP_ONCE_NC;
7803              
7804 6 50         if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
    50          
7805 0           cd->assert_depth -= 1;
7806              
7807             /* At the end of compiling, code is still pointing to the start of the
7808             group, while tempcode has been updated to point past the end of the group.
7809             The pattern pointer (ptr) is on the bracket.
7810              
7811             If this is a conditional bracket, check that there are no more than
7812             two branches in the group, or just one if it's a DEFINE group. We do this
7813             in the real compile phase, not in the pre-pass, where the whole group may
7814             not be available. */
7815              
7816 6 50         if (bravalue == OP_COND && lengthptr == NULL)
    0          
7817             {
7818 0           pcre_uchar *tc = code;
7819 0           int condcount = 0;
7820              
7821             do {
7822 0           condcount++;
7823 0           tc += GET(tc,1);
7824             }
7825 0 0         while (*tc != OP_KET);
7826              
7827             /* A DEFINE group is never obeyed inline (the "condition" is always
7828             false). It must have only one branch. */
7829              
7830 0 0         if (code[LINK_SIZE+1] == OP_DEF)
7831             {
7832 0 0         if (condcount > 1)
7833             {
7834 0           *errorcodeptr = ERR54;
7835 0           goto FAILED;
7836             }
7837 0           bravalue = OP_DEF; /* Just a flag to suppress char handling below */
7838             }
7839              
7840             /* A "normal" conditional group. If there is just one branch, we must not
7841             make use of its firstchar or reqchar, because this is equivalent to an
7842             empty second branch. */
7843              
7844             else
7845             {
7846 0 0         if (condcount > 2)
7847             {
7848 0           *errorcodeptr = ERR27;
7849 0           goto FAILED;
7850             }
7851 0 0         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7852             }
7853             }
7854              
7855             /* Error if hit end of pattern */
7856              
7857 6 50         if (*ptr != CHAR_RIGHT_PARENTHESIS)
7858             {
7859 0           *errorcodeptr = ERR14;
7860 0           goto FAILED;
7861             }
7862              
7863             /* In the pre-compile phase, update the length by the length of the group,
7864             less the brackets at either end. Then reduce the compiled code to just a
7865             set of non-capturing brackets so that it doesn't use much memory if it is
7866             duplicated by a quantifier.*/
7867              
7868 6 100         if (lengthptr != NULL)
7869             {
7870 3 50         if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7871             {
7872 0           *errorcodeptr = ERR20;
7873 0           goto FAILED;
7874             }
7875 3           *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7876 3           code++; /* This already contains bravalue */
7877 3           PUTINC(code, 0, 1 + LINK_SIZE);
7878 3           *code++ = OP_KET;
7879 3           PUTINC(code, 0, 1 + LINK_SIZE);
7880 3           break; /* No need to waste time with special character handling */
7881             }
7882              
7883             /* Otherwise update the main code pointer to the end of the group. */
7884              
7885 3           code = tempcode;
7886              
7887             /* For a DEFINE group, required and first character settings are not
7888             relevant. */
7889              
7890 3 50         if (bravalue == OP_DEF) break;
7891              
7892             /* Handle updating of the required and first characters for other types of
7893             group. Update for normal brackets of all kinds, and conditions with two
7894             branches (see code above). If the bracket is followed by a quantifier with
7895             zero repeat, we have to back off. Hence the definition of zeroreqchar and
7896             zerofirstchar outside the main loop so that they can be accessed for the
7897             back off. */
7898              
7899 3           zeroreqchar = reqchar;
7900 3           zeroreqcharflags = reqcharflags;
7901 3           zerofirstchar = firstchar;
7902 3           zerofirstcharflags = firstcharflags;
7903 3           groupsetfirstchar = FALSE;
7904              
7905 3 50         if (bravalue >= OP_ONCE)
7906             {
7907             /* If we have not yet set a firstchar in this branch, take it from the
7908             subpattern, remembering that it was set here so that a repeat of more
7909             than one can replicate it as reqchar if necessary. If the subpattern has
7910             no firstchar, set "none" for the whole branch. In both cases, a zero
7911             repeat forces firstchar to "none". */
7912              
7913 3 50         if (firstcharflags == REQ_UNSET)
7914             {
7915 0 0         if (subfirstcharflags >= 0)
7916             {
7917 0           firstchar = subfirstchar;
7918 0           firstcharflags = subfirstcharflags;
7919 0           groupsetfirstchar = TRUE;
7920             }
7921 0           else firstcharflags = REQ_NONE;
7922 0           zerofirstcharflags = REQ_NONE;
7923             }
7924              
7925             /* If firstchar was previously set, convert the subpattern's firstchar
7926             into reqchar if there wasn't one, using the vary flag that was in
7927             existence beforehand. */
7928              
7929 3 50         else if (subfirstcharflags >= 0 && subreqcharflags < 0)
    50          
7930             {
7931 0           subreqchar = subfirstchar;
7932 0           subreqcharflags = subfirstcharflags | tempreqvary;
7933             }
7934              
7935             /* If the subpattern set a required byte (or set a first byte that isn't
7936             really the first byte - see above), set it. */
7937              
7938 3 50         if (subreqcharflags >= 0)
7939             {
7940 3           reqchar = subreqchar;
7941 3           reqcharflags = subreqcharflags;
7942             }
7943             }
7944              
7945             /* For a forward assertion, we take the reqchar, if set, provided that the
7946             group has also set a first char. This can be helpful if the pattern that
7947             follows the assertion doesn't set a different char. For example, it's
7948             useful for /(?=abcde).+/. We can't set firstchar for an assertion, however
7949             because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7950             the "real" "a" would then become a reqchar instead of a firstchar. This is
7951             overcome by a scan at the end if there's no firstchar, looking for an
7952             asserted first char. */
7953              
7954 0 0         else if (bravalue == OP_ASSERT && subreqcharflags >= 0 &&
    0          
    0          
7955 0           subfirstcharflags >= 0)
7956             {
7957 0           reqchar = subreqchar;
7958 0           reqcharflags = subreqcharflags;
7959             }
7960 3           break; /* End of processing '(' */
7961              
7962              
7963             /* ===================================================================*/
7964             /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7965             are arranged to be the negation of the corresponding OP_values in the
7966             default case when PCRE_UCP is not set. For the back references, the values
7967             are negative the reference number. Only back references and those types
7968             that consume a character may be repeated. We can test for values between
7969             ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7970             ever created. */
7971              
7972             case CHAR_BACKSLASH:
7973 164           tempptr = ptr;
7974 164           escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7975 164 50         if (*errorcodeptr != 0) goto FAILED;
7976              
7977 164 50         if (escape == 0) /* The escape coded a single character */
7978 164           c = ec;
7979             else
7980             {
7981             /* For metasequences that actually match a character, we disable the
7982             setting of a first character if it hasn't already been set. */
7983              
7984 0 0         if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
    0          
    0          
7985 0           firstcharflags = REQ_NONE;
7986              
7987             /* Set values to reset to if this is followed by a zero repeat. */
7988              
7989 0           zerofirstchar = firstchar;
7990 0           zerofirstcharflags = firstcharflags;
7991 0           zeroreqchar = reqchar;
7992 0           zeroreqcharflags = reqcharflags;
7993              
7994             /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7995             is a subroutine call by number (Oniguruma syntax). In fact, the value
7996             ESC_g is returned only for these cases. So we don't need to check for <
7997             or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7998             -n, and for the Perl syntax \g{name} the result is ESC_k (as
7999             that is a synonym for a named back reference). */
8000              
8001 0 0         if (escape == ESC_g)
8002             {
8003             const pcre_uchar *p;
8004             pcre_uint32 cf;
8005              
8006 0           item_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */
8007 0           terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8008 0 0         CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
8009              
8010             /* These two statements stop the compiler from warning about possibly
8011             unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
8012             fact, because we do the check for a number below, the paths that
8013             would actually be in error are never taken. */
8014              
8015 0           skipbytes = 0;
8016 0           reset_bracount = FALSE;
8017              
8018             /* If it's not a signed or unsigned number, treat it as a name. */
8019              
8020 0           cf = ptr[1];
8021 0 0         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
    0          
    0          
    0          
8022             {
8023 0           is_recurse = TRUE;
8024 0           goto NAMED_REF_OR_RECURSE;
8025             }
8026              
8027             /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
8028             or a digit. */
8029              
8030 0           p = ptr + 2;
8031 0 0         while (IS_DIGIT(*p)) p++;
    0          
8032 0 0         if (*p != (pcre_uchar)terminator)
8033             {
8034 0           *errorcodeptr = ERR57;
8035 0           goto FAILED;
8036             }
8037 0           ptr++;
8038 0           goto HANDLE_NUMERICAL_RECURSION;
8039             }
8040              
8041             /* \k<name> or \k'name' is a back reference by name (Perl syntax).
8042             We also support \k{name} (.NET syntax). */
8043              
8044 0 0         if (escape == ESC_k)
8045             {
8046 0 0         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
    0          
8047 0 0         ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
8048             {
8049 0           *errorcodeptr = ERR69;
8050 0           goto FAILED;
8051             }
8052 0           is_recurse = FALSE;
8053 0           terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8054 0 0         CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
8055 0 0         CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
8056 0           goto NAMED_REF_OR_RECURSE;
8057             }
8058              
8059             /* Back references are handled specially; must disable firstchar if
8060             not set to cope with cases like (?=(\w+))\1: which would otherwise set
8061             ':' later. */
8062              
8063 0 0         if (escape < 0)
8064             {
8065             open_capitem *oc;
8066 0           recno = -escape;
8067              
8068             /* Come here from named backref handling when the reference is to a
8069             single group (i.e. not to a duplicated name). */
8070              
8071             HANDLE_REFERENCE:
8072 0 0         if (firstcharflags == REQ_UNSET) zerofirstcharflags = firstcharflags = REQ_NONE;
8073 0           previous = code;
8074 0           item_hwm_offset = cd->hwm - cd->start_workspace;
8075 0 0         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8076 0           PUT2INC(code, 0, recno);
8077 0 0         cd->backref_map |= (recno < 32)? (1U << recno) : 1;
8078 0 0         if (recno > cd->top_backref) cd->top_backref = recno;
8079              
8080             /* Check to see if this back reference is recursive, that is, it
8081             is inside the group that it references. A flag is set so that the
8082             group can be made atomic. */
8083              
8084 0 0         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8085             {
8086 0 0         if (oc->number == recno)
8087             {
8088 0           oc->flag = TRUE;
8089 0           break;
8090             }
8091             }
8092             }
8093              
8094             /* So are Unicode property matches, if supported. */
8095              
8096             #ifdef SUPPORT_UCP
8097             else if (escape == ESC_P || escape == ESC_p)
8098             {
8099             BOOL negated;
8100             unsigned int ptype = 0, pdata = 0;
8101             if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8102             goto FAILED;
8103             previous = code;
8104             item_hwm_offset = cd->hwm - cd->start_workspace;
8105             *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8106             *code++ = ptype;
8107             *code++ = pdata;
8108             }
8109             #else
8110              
8111             /* If Unicode properties are not supported, \X, \P, and \p are not
8112             allowed. */
8113              
8114 0 0         else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
    0          
    0          
8115             {
8116 0           *errorcodeptr = ERR45;
8117 0           goto FAILED;
8118             }
8119             #endif
8120              
8121             /* For the rest (including \X when Unicode properties are supported), we
8122             can obtain the OP value by negating the escape value in the default
8123             situation when PCRE_UCP is not set. When it *is* set, we substitute
8124             Unicode property tests. Note that \b and \B do a one-character
8125             lookbehind, and \A also behaves as if it does. */
8126              
8127             else
8128             {
8129 0 0         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
    0          
    0          
    0          
8130 0           cd->max_lookbehind == 0)
8131 0           cd->max_lookbehind = 1;
8132             #ifdef SUPPORT_UCP
8133             if (escape >= ESC_DU && escape <= ESC_wu)
8134             {
8135             nestptr = ptr + 1; /* Where to resume */
8136             ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
8137             }
8138             else
8139             #endif
8140             /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8141             so that it works in DFA mode and in lookbehinds. */
8142              
8143             {
8144 0 0         previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
    0          
8145 0           item_hwm_offset = cd->hwm - cd->start_workspace;
8146 0 0         *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
    0          
8147             }
8148             }
8149 0           continue;
8150             }
8151              
8152             /* We have a data character whose value is in c. In UTF-8 mode it may have
8153             a value > 127. We set its representation in the length/buffer, and then
8154             handle it as a data character. */
8155              
8156             #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8157             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8158             mclength = PRIV(ord2utf)(c, mcbuffer);
8159             else
8160             #endif
8161              
8162             {
8163 164           mcbuffer[0] = c;
8164 164           mclength = 1;
8165             }
8166 164           goto ONE_CHAR;
8167              
8168              
8169             /* ===================================================================*/
8170             /* Handle a literal character. It is guaranteed not to be whitespace or #
8171             when the extended flag is set. If we are in a UTF mode, it may be a
8172             multi-unit literal character. */
8173              
8174             default:
8175             NORMAL_CHAR:
8176 1130           mclength = 1;
8177 1130           mcbuffer[0] = c;
8178              
8179             #ifdef SUPPORT_UTF
8180             if (utf && HAS_EXTRALEN(c))
8181             ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8182             #endif
8183              
8184             /* At this point we have the character's bytes in mcbuffer, and the length
8185             in mclength. When not in UTF-8 mode, the length is always 1. */
8186              
8187             ONE_CHAR:
8188 1294           previous = code;
8189 1294           item_hwm_offset = cd->hwm - cd->start_workspace;
8190              
8191             /* For caseless UTF-8 mode when UCP support is available, check whether
8192             this character has more than one other case. If so, generate a special
8193             OP_PROP item instead of OP_CHARI. */
8194              
8195             #ifdef SUPPORT_UCP
8196             if (utf && (options & PCRE_CASELESS) != 0)
8197             {
8198             GETCHAR(c, mcbuffer);
8199             if ((c = UCD_CASESET(c)) != 0)
8200             {
8201             *code++ = OP_PROP;
8202             *code++ = PT_CLIST;
8203             *code++ = c;
8204             if (firstcharflags == REQ_UNSET)
8205             firstcharflags = zerofirstcharflags = REQ_NONE;
8206             break;
8207             }
8208             }
8209             #endif
8210              
8211             /* Caseful matches, or not one of the multicase characters. */
8212              
8213 1294 50         *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8214 2588 100         for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8215              
8216             /* Remember if \r or \n were seen */
8217              
8218 1294 50         if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
    50          
8219 0           cd->external_flags |= PCRE_HASCRORLF;
8220              
8221             /* Set the first and required bytes appropriately. If no previous first
8222             byte, set it from this character, but revert to none on a zero repeat.
8223             Otherwise, leave the firstchar value alone, and don't change it on a zero
8224             repeat. */
8225              
8226 1294 100         if (firstcharflags == REQ_UNSET)
8227             {
8228 88           zerofirstcharflags = REQ_NONE;
8229 88           zeroreqchar = reqchar;
8230 88           zeroreqcharflags = reqcharflags;
8231              
8232             /* If the character is more than one byte long, we can set firstchar
8233             only if it is not to be matched caselessly. */
8234              
8235 88 50         if (mclength == 1 || req_caseopt == 0)
    0          
8236             {
8237 88           firstchar = mcbuffer[0];
8238 88           firstcharflags = req_caseopt;
8239              
8240 88 50         if (mclength != 1)
8241             {
8242 0           reqchar = code[-1];
8243 0           reqcharflags = cd->req_varyopt;
8244             }
8245             }
8246 88           else firstcharflags = reqcharflags = REQ_NONE;
8247             }
8248              
8249             /* firstchar was previously set; we can set reqchar only if the length is
8250             1 or the matching is caseful. */
8251              
8252             else
8253             {
8254 1206           zerofirstchar = firstchar;
8255 1206           zerofirstcharflags = firstcharflags;
8256 1206           zeroreqchar = reqchar;
8257 1206           zeroreqcharflags = reqcharflags;
8258 1206 50         if (mclength == 1 || req_caseopt == 0)
    0          
8259             {
8260 1206           reqchar = code[-1];
8261 1206           reqcharflags = req_caseopt | cd->req_varyopt;
8262             }
8263             }
8264              
8265 1502           break; /* End of literal character handling */
8266             }
8267 1502           } /* end of big loop */
8268              
8269              
8270             /* Control never reaches here by falling through, only by a goto for all the
8271             error states. Pass back the position in the pattern so that it can be displayed
8272             to the user for diagnosing the error. */
8273              
8274             FAILED:
8275 0           *ptrptr = ptr;
8276 98           return FALSE;
8277             }
8278              
8279              
8280              
8281             /*************************************************
8282             * Compile sequence of alternatives *
8283             *************************************************/
8284              
8285             /* On entry, ptr is pointing past the bracket character, but on return it
8286             points to the closing bracket, or vertical bar, or end of string. The code
8287             variable is pointing at the byte into which the BRA operator has been stored.
8288             This function is used during the pre-compile phase when we are trying to find
8289             out the amount of memory needed, as well as during the real compile phase. The
8290             value of lengthptr distinguishes the two phases.
8291              
8292             Arguments:
8293             options option bits, including any changes for this subpattern
8294             codeptr -> the address of the current code pointer
8295             ptrptr -> the address of the current pattern pointer
8296             errorcodeptr -> pointer to error code variable
8297             lookbehind TRUE if this is a lookbehind assertion
8298             reset_bracount TRUE to reset the count for each branch
8299             skipbytes skip this many bytes at start (for brackets and OP_COND)
8300             cond_depth depth of nesting for conditional subpatterns
8301             firstcharptr place to put the first required character
8302             firstcharflagsptr place to put the first character flags, or a negative number
8303             reqcharptr place to put the last required character
8304             reqcharflagsptr place to put the last required character flags, or a negative number
8305             bcptr pointer to the chain of currently open branches
8306             cd points to the data block with tables pointers etc.
8307             lengthptr NULL during the real compile phase
8308             points to length accumulator during pre-compile phase
8309              
8310             Returns: TRUE on success
8311             */
8312              
8313             static BOOL
8314 98           compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8315             int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8316             int cond_depth,
8317             pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8318             pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8319             branch_chain *bcptr, compile_data *cd, int *lengthptr)
8320             {
8321 98           const pcre_uchar *ptr = *ptrptr;
8322 98           pcre_uchar *code = *codeptr;
8323 98           pcre_uchar *last_branch = code;
8324 98           pcre_uchar *start_bracket = code;
8325 98           pcre_uchar *reverse_count = NULL;
8326             open_capitem capitem;
8327 98           int capnumber = 0;
8328             pcre_uint32 firstchar, reqchar;
8329             pcre_int32 firstcharflags, reqcharflags;
8330             pcre_uint32 branchfirstchar, branchreqchar;
8331             pcre_int32 branchfirstcharflags, branchreqcharflags;
8332             int length;
8333             unsigned int orig_bracount;
8334             unsigned int max_bracount;
8335             branch_chain bc;
8336             size_t save_hwm_offset;
8337              
8338             /* If set, call the external function that checks for stack availability. */
8339              
8340 98 50         if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
    0          
8341             {
8342 0           *errorcodeptr= ERR85;
8343 0           return FALSE;
8344             }
8345              
8346             /* Miscellaneous initialization */
8347              
8348 98           bc.outer = bcptr;
8349 98           bc.current_branch = code;
8350              
8351 98           firstchar = reqchar = 0;
8352 98           firstcharflags = reqcharflags = REQ_UNSET;
8353              
8354 98           save_hwm_offset = cd->hwm - cd->start_workspace;
8355              
8356             /* Accumulate the length for use in the pre-compile phase. Start with the
8357             length of the BRA and KET and any extra bytes that are required at the
8358             beginning. We accumulate in a local variable to save frequent testing of
8359             lengthptr for NULL. We cannot do this by looking at the value of code at the
8360             start and end of each alternative, because compiled items are discarded during
8361             the pre-compile phase so that the work space is not exceeded. */
8362              
8363 98           length = 2 + 2*LINK_SIZE + skipbytes;
8364              
8365             /* WARNING: If the above line is changed for any reason, you must also change
8366             the code that abstracts option settings at the start of the pattern and makes
8367             them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8368             pre-compile phase to find out whether anything has yet been compiled or not. */
8369              
8370             /* If this is a capturing subpattern, add to the chain of open capturing items
8371             so that we can detect them if (*ACCEPT) is encountered. This is also used to
8372             detect groups that contain recursive back references to themselves. Note that
8373             only OP_CBRA need be tested here; changing this opcode to one of its variants,
8374             e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8375              
8376 98 100         if (*code == OP_CBRA)
8377             {
8378 6           capnumber = GET2(code, 1 + LINK_SIZE);
8379 6           capitem.number = capnumber;
8380 6           capitem.next = cd->open_caps;
8381 6           capitem.flag = FALSE;
8382 6           cd->open_caps = &capitem;
8383             }
8384              
8385             /* Offset is set zero to mark that this bracket is still open */
8386              
8387 98           PUT(code, 1, 0);
8388 98           code += 1 + LINK_SIZE + skipbytes;
8389              
8390             /* Loop for each alternative branch */
8391              
8392 98           orig_bracount = max_bracount = cd->bracount;
8393             for (;;)
8394             {
8395             /* For a (?| group, reset the capturing bracket count so that each branch
8396             uses the same numbers. */
8397              
8398 98 50         if (reset_bracount) cd->bracount = orig_bracount;
8399              
8400             /* Set up dummy OP_REVERSE if lookbehind assertion */
8401              
8402 98 50         if (lookbehind)
8403             {
8404 0           *code++ = OP_REVERSE;
8405 0           reverse_count = code;
8406 0           PUTINC(code, 0, 0);
8407 0           length += 1 + LINK_SIZE;
8408             }
8409              
8410             /* Now compile the branch; in the pre-compile phase its length gets added
8411             into the length. */
8412              
8413 98 100         if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
    50          
8414             &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8415             cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8416             {
8417 0           *ptrptr = ptr;
8418 0           return FALSE;
8419             }
8420              
8421             /* Keep the highest bracket count in case (?| was used and some branch
8422             has fewer than the rest. */
8423              
8424 98 100         if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8425              
8426             /* In the real compile phase, there is some post-processing to be done. */
8427              
8428 98 100         if (lengthptr == NULL)
8429             {
8430             /* If this is the first branch, the firstchar and reqchar values for the
8431             branch become the values for the regex. */
8432              
8433 49 50         if (*last_branch != OP_ALT)
8434             {
8435 49           firstchar = branchfirstchar;
8436 49           firstcharflags = branchfirstcharflags;
8437 49           reqchar = branchreqchar;
8438 49           reqcharflags = branchreqcharflags;
8439             }
8440              
8441             /* If this is not the first branch, the first char and reqchar have to
8442             match the values from all the previous branches, except that if the
8443             previous value for reqchar didn't have REQ_VARY set, it can still match,
8444             and we set REQ_VARY for the regex. */
8445              
8446             else
8447             {
8448             /* If we previously had a firstchar, but it doesn't match the new branch,
8449             we have to abandon the firstchar for the regex, but if there was
8450             previously no reqchar, it takes on the value of the old firstchar. */
8451              
8452 0 0         if (firstcharflags >= 0 &&
    0          
8453 0 0         (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8454             {
8455 0 0         if (reqcharflags < 0)
8456             {
8457 0           reqchar = firstchar;
8458 0           reqcharflags = firstcharflags;
8459             }
8460 0           firstcharflags = REQ_NONE;
8461             }
8462              
8463             /* If we (now or from before) have no firstchar, a firstchar from the
8464             branch becomes a reqchar if there isn't a branch reqchar. */
8465              
8466 0 0         if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
    0          
    0          
8467             {
8468 0           branchreqchar = branchfirstchar;
8469 0           branchreqcharflags = branchfirstcharflags;
8470             }
8471              
8472             /* Now ensure that the reqchars match */
8473              
8474 0 0         if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
    0          
8475 0           reqchar != branchreqchar)
8476 0           reqcharflags = REQ_NONE;
8477             else
8478             {
8479 0           reqchar = branchreqchar;
8480 0           reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8481             }
8482             }
8483              
8484             /* If lookbehind, check that this branch matches a fixed-length string, and
8485             put the length into the OP_REVERSE item. Temporarily mark the end of the
8486             branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8487             because there may be forward references that we can't check here. Set a
8488             flag to cause another lookbehind check at the end. Why not do it all at the
8489             end? Because common, erroneous checks are picked up here and the offset of
8490             the problem can be shown. */
8491              
8492 49 50         if (lookbehind)
8493             {
8494             int fixed_length;
8495 0           *code = OP_END;
8496 0           fixed_length = find_fixedlength(last_branch, (options & PCRE_UTF8) != 0,
8497             FALSE, cd, NULL);
8498             DPRINTF(("fixed length = %d\n", fixed_length));
8499 0 0         if (fixed_length == -3)
8500             {
8501 0           cd->check_lookbehind = TRUE;
8502             }
8503 0 0         else if (fixed_length < 0)
8504             {
8505 0 0         *errorcodeptr = (fixed_length == -2)? ERR36 :
8506 0 0         (fixed_length == -4)? ERR70: ERR25;
8507 0           *ptrptr = ptr;
8508 0           return FALSE;
8509             }
8510             else
8511             {
8512 0 0         if (fixed_length > cd->max_lookbehind)
8513 0           cd->max_lookbehind = fixed_length;
8514 0           PUT(reverse_count, 0, fixed_length);
8515             }
8516             }
8517             }
8518              
8519             /* Reached end of expression, either ')' or end of pattern. In the real
8520             compile phase, go back through the alternative branches and reverse the chain
8521             of offsets, with the field in the BRA item now becoming an offset to the
8522             first alternative. If there are no alternatives, it points to the end of the
8523             group. The length in the terminating ket is always the length of the whole
8524             bracketed item. Return leaving the pointer at the terminating char. */
8525              
8526 98 50         if (*ptr != CHAR_VERTICAL_LINE)
8527             {
8528 98 100         if (lengthptr == NULL)
8529             {
8530 49           int branch_length = (int)(code - last_branch);
8531             do
8532             {
8533 49           int prev_length = GET(last_branch, 1);
8534 49           PUT(last_branch, 1, branch_length);
8535 49           branch_length = prev_length;
8536 49           last_branch -= branch_length;
8537             }
8538 49 50         while (branch_length > 0);
8539             }
8540              
8541             /* Fill in the ket */
8542              
8543 98           *code = OP_KET;
8544 98           PUT(code, 1, (int)(code - start_bracket));
8545 98           code += 1 + LINK_SIZE;
8546              
8547             /* If it was a capturing subpattern, check to see if it contained any
8548             recursive back references. If so, we must wrap it in atomic brackets.
8549             Because we are moving code along, we must ensure that any pending recursive
8550             references are updated. In any event, remove the block from the chain. */
8551              
8552 98 100         if (capnumber > 0)
8553             {
8554 6 50         if (cd->open_caps->flag)
8555             {
8556 0           *code = OP_END;
8557 0           adjust_recurse(start_bracket, 1 + LINK_SIZE,
8558 0           (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8559 0           memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8560             IN_UCHARS(code - start_bracket));
8561 0           *start_bracket = OP_ONCE;
8562 0           code += 1 + LINK_SIZE;
8563 0           PUT(start_bracket, 1, (int)(code - start_bracket));
8564 0           *code = OP_KET;
8565 0           PUT(code, 1, (int)(code - start_bracket));
8566 0           code += 1 + LINK_SIZE;
8567 0           length += 2 + 2*LINK_SIZE;
8568             }
8569 6           cd->open_caps = cd->open_caps->next;
8570             }
8571              
8572             /* Retain the highest bracket number, in case resetting was used. */
8573              
8574 98           cd->bracount = max_bracount;
8575              
8576             /* Set values to pass back */
8577              
8578 98           *codeptr = code;
8579 98           *ptrptr = ptr;
8580 98           *firstcharptr = firstchar;
8581 98           *firstcharflagsptr = firstcharflags;
8582 98           *reqcharptr = reqchar;
8583 98           *reqcharflagsptr = reqcharflags;
8584 98 100         if (lengthptr != NULL)
8585             {
8586 49 50         if (OFLOW_MAX - *lengthptr < length)
8587             {
8588 0           *errorcodeptr = ERR20;
8589 0           return FALSE;
8590             }
8591 49           *lengthptr += length;
8592             }
8593 98           return TRUE;
8594             }
8595              
8596             /* Another branch follows. In the pre-compile phase, we can move the code
8597             pointer back to where it was for the start of the first branch. (That is,
8598             pretend that each branch is the only one.)
8599              
8600             In the real compile phase, insert an ALT node. Its length field points back
8601             to the previous branch while the bracket remains open. At the end the chain
8602             is reversed. It's done like this so that the start of the bracket has a
8603             zero offset until it is closed, making it possible to detect recursion. */
8604              
8605 0 0         if (lengthptr != NULL)
8606             {
8607 0           code = *codeptr + 1 + LINK_SIZE + skipbytes;
8608 0           length += 1 + LINK_SIZE;
8609             }
8610             else
8611             {
8612 0           *code = OP_ALT;
8613 0           PUT(code, 1, (int)(code - last_branch));
8614 0           bc.current_branch = last_branch = code;
8615 0           code += 1 + LINK_SIZE;
8616             }
8617              
8618 0           ptr++;
8619 98           }
8620             /* Control never reaches here */
8621             }
8622              
8623              
8624              
8625              
8626             /*************************************************
8627             * Check for anchored expression *
8628             *************************************************/
8629              
8630             /* Try to find out if this is an anchored regular expression. Consider each
8631             alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8632             all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8633             it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8634             be found, because ^ generates OP_CIRCM in that mode.
8635              
8636             We can also consider a regex to be anchored if OP_SOM starts all its branches.
8637             This is the code for \G, which means "match at start of match position, taking
8638             into account the match offset".
8639              
8640             A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8641             because that will try the rest of the pattern at all possible matching points,
8642             so there is no point trying again.... er ....
8643              
8644             .... except when the .* appears inside capturing parentheses, and there is a
8645             subsequent back reference to those parentheses. We haven't enough information
8646             to catch that case precisely.
8647              
8648             At first, the best we could do was to detect when .* was in capturing brackets
8649             and the highest back reference was greater than or equal to that level.
8650             However, by keeping a bitmap of the first 31 back references, we can catch some
8651             of the more common cases more precisely.
8652              
8653             ... A second exception is when the .* appears inside an atomic group, because
8654             this prevents the number of characters it matches from being adjusted.
8655              
8656             Arguments:
8657             code points to start of expression (the bracket)
8658             bracket_map a bitmap of which brackets we are inside while testing; this
8659             handles up to substring 31; after that we just have to take
8660             the less precise approach
8661             cd points to the compile data block
8662             atomcount atomic group level
8663              
8664             Returns: TRUE or FALSE
8665             */
8666              
8667             static BOOL
8668 46           is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8669             compile_data *cd, int atomcount)
8670             {
                 /* Each pass of the loop examines one alternative of the group at "code".
                 The first alternative that fails the test makes the result FALSE; TRUE
                 is returned only after the last alternative has passed. */
8671             do {
                 /* Step over the opening item of this alternative (its size is looked up
                 in the OP_lengths table) and classify the opcode that
                 first_significant_code() settles on. That helper is defined elsewhere;
                 presumably it skips items that cannot affect anchoring - confirm there. */
8672 46           const pcre_uchar *scode = first_significant_code(
8673 46           code + PRIV(OP_lengths)[*code], FALSE);
8674 46           register int op = *scode;
8675              
8676             /* Non-capturing brackets */
8677              
8678 46 50         if (op == OP_BRA || op == OP_BRAPOS ||
    50          
    50          
8679 46 50         op == OP_SBRA || op == OP_SBRAPOS)
8680             {
8681 0 0         if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8682             }
8683              
8684             /* Capturing brackets */
8685              
8686 46 50         else if (op == OP_CBRA || op == OP_CBRAPOS ||
    50          
    50          
8687 46 50         op == OP_SCBRA || op == OP_SCBRAPOS)
8688 0           {
                 /* n is the value GET2 reads at offset 1+LINK_SIZE of the bracket item,
                 used here as the group number. Groups below 32 get their own bit in the
                 map; every higher-numbered group is folded into bit 0. */
8689 0           int n = GET2(scode, 1+LINK_SIZE);
8690 0 0         int new_map = bracket_map | ((n < 32)? (1U << n) : 1);
8691 0 0         if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8692             }
8693              
8694             /* Positive forward assertion */
8695              
8696 46 50         else if (op == OP_ASSERT)
8697             {
8698 0 0         if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8699             }
8700              
8701             /* Condition; not anchored if no second branch */
8702              
8703 46 50         else if (op == OP_COND)
8704             {
                 /* GET(scode,1) is used as the offset from the OP_COND item to the item
                 that ends its first branch; that item has to be OP_ALT. */
8705 0 0         if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8706 0 0         if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8707             }
8708              
8709             /* Atomic groups */
8710              
8711 46 50         else if (op == OP_ONCE || op == OP_ONCE_NC)
    50          
8712             {
                 /* The raised atomcount lets a nested .* see that it sits inside an
                 atomic group (tested further down as atomcount > 0). */
8713 0 0         if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8714 0           return FALSE;
8715             }
8716              
8717             /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8718             it isn't in brackets that are or may be referenced or inside an atomic
8719             group. */
8720              
8721 46 50         else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
    50          
    50          
8722             op == OP_TYPEPOSSTAR))
8723             {
                 /* scode[1] is the item being repeated. The branch is rejected if that
                 is not OP_ALLANY, if any enclosing group recorded in bracket_map is
                 also set in cd->backref_map, if we are inside an atomic group, or if
                 cd->had_pruneorskip is set. */
8724 0 0         if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
    0          
    0          
8725 0 0         atomcount > 0 || cd->had_pruneorskip)
8726 0           return FALSE;
8727             }
8728              
8729             /* Check for explicit anchoring */
8730              
8731 46 50         else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
    50          
    100          
8732              
                 /* Use the value stored at offset 1 of the current item to move on to
                 whatever follows this alternative. */
8733 5           code += GET(code, 1);
8734             }
8735 5 50         while (*code == OP_ALT); /* Loop for each alternative */
8736 5           return TRUE;
8737             }
8738              
8739              
8740              
8741             /*************************************************
8742             * Check for starting with ^ or .* *
8743             *************************************************/
8744              
8745             /* This is called to find out if every branch starts with ^ or .* so that
8746             "first char" processing can be done to speed things up in multiline
8747             matching and for non-DOTALL patterns that start with .* (which must start at
8748             the beginning or after \n). As in the case of is_anchored() (see above), we
8749             have to take account of back references to capturing brackets that contain .*
8750             because in that case we can't make the assumption. Also, the appearance of .*
8751             inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8752             or *SKIP does not count, because once again the assumption no longer holds.
8753              
8754             Arguments:
8755             code points to start of expression (the bracket)
8756             bracket_map a bitmap of which brackets we are inside while testing; this
8757             handles up to substring 31; after that we just have to take
8758             the less precise approach
8759             cd points to the compile data
8760             atomcount atomic group level
8761             inassert TRUE if in an assertion
8762              
8763             Returns: TRUE or FALSE
8764             */
8765              
8766             static BOOL
8767 3           is_startline(const pcre_uchar *code, unsigned int bracket_map,
8768             compile_data *cd, int atomcount, BOOL inassert)
8769             {
                 /* Each pass of the loop examines one alternative of the group at "code".
                 The first alternative that fails makes the result FALSE; TRUE is returned
                 only after the last alternative has passed. */
8770             do {
                 /* Step over the opening item of this alternative (its size is looked up
                 in the OP_lengths table) and classify the opcode that
                 first_significant_code() settles on. */
8771 3           const pcre_uchar *scode = first_significant_code(
8772 3           code + PRIV(OP_lengths)[*code], FALSE);
8773 3           register int op = *scode;
8774              
8775             /* If we are at the start of a conditional assertion group, *both* the
8776             conditional assertion *and* what follows the condition must satisfy the test
8777             for start of line. Other kinds of condition fail. Note that there may be an
8778             auto-callout at the start of a condition. */
8779              
8780 3 50         if (op == OP_COND)
8781             {
                 /* Move past the OP_COND opcode and its link, and past a callout item if
                 one comes first, to reach the condition itself. */
8782 0           scode += 1 + LINK_SIZE;
8783 0 0         if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8784 0 0         switch (*scode)
8785             {
                 /* Conditions of these kinds give FALSE outright. */
8786             case OP_CREF:
8787             case OP_DNCREF:
8788             case OP_RREF:
8789             case OP_DNRREF:
8790             case OP_DEF:
8791             case OP_FAIL:
8792 0           return FALSE;
8793              
                 /* The assertion is checked with inassert forced to TRUE. After that,
                 walk over all of its alternatives and then 1 + LINK_SIZE more units
                 (presumably the closing ket) to reach what follows the condition. */
8794             default: /* Assertion */
8795 0 0         if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8796 0 0         do scode += GET(scode, 1); while (*scode == OP_ALT);
8797 0           scode += 1 + LINK_SIZE;
8798 0           break;
8799             }
                 /* Re-read the opcode at the item that follows the condition. The next
                 test is a plain "if", not "else if", so that item goes through the same
                 checks as any other branch start. */
8800 0           scode = first_significant_code(scode, FALSE);
8801 0           op = *scode;
8802             }
8803              
8804             /* Non-capturing brackets */
8805              
8806 3 50         if (op == OP_BRA || op == OP_BRAPOS ||
    50          
    50          
8807 3 50         op == OP_SBRA || op == OP_SBRAPOS)
8808             {
8809 0 0         if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE;
8810             }
8811              
8812             /* Capturing brackets */
8813              
8814 3 50         else if (op == OP_CBRA || op == OP_CBRAPOS ||
    50          
    50          
8815 3 50         op == OP_SCBRA || op == OP_SCBRAPOS)
8816 0           {
                 /* n is the value GET2 reads at offset 1+LINK_SIZE of the bracket item,
                 used here as the group number. Groups below 32 get their own bit in the
                 map; every higher-numbered group is folded into bit 0. */
8817 0           int n = GET2(scode, 1+LINK_SIZE);
8818 0 0         int new_map = bracket_map | ((n < 32)? (1U << n) : 1);
8819 0 0         if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE;
8820             }
8821              
8822             /* Positive forward assertions */
8823              
8824 3 50         else if (op == OP_ASSERT)
8825             {
                 /* Everything inside the assertion is examined with inassert TRUE, which
                 makes a .* found there fail the test below. */
8826 0 0         if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8827             }
8828              
8829             /* Atomic brackets */
8830              
8831 3 50         else if (op == OP_ONCE || op == OP_ONCE_NC)
    50          
8832             {
                 /* The raised atomcount lets a nested .* see that it sits inside an
                 atomic group (tested further down as atomcount > 0). */
8833 0 0         if (!is_startline(scode, bracket_map, cd, atomcount + 1, inassert)) return FALSE;
8834             }
8835              
8836             /* .* means "start at start or after \n" if it isn't in atomic brackets or
8837             brackets that may be referenced or an assertion, as long as the pattern does
8838             not contain *PRUNE or *SKIP, because these break the feature. Consider, for
8839             example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e.
8840             not at the start of a line. */
8841              
8842 3 50         else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
    50          
    50          
8843             {
                 /* scode[1] is the item being repeated; here it has to be OP_ANY, whereas
                 is_anchored() requires OP_ALLANY at the same point. */
8844 0 0         if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
    0          
    0          
8845 0 0         atomcount > 0 || cd->had_pruneorskip || inassert)
    0          
8846 0           return FALSE;
8847             }
8848              
8849             /* Check for explicit circumflex; anything else gives a FALSE result. Note
8850             in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8851             because the number of characters matched by .* cannot be adjusted inside
8852             them. */
8853              
8854 3 50         else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
    50          
8855              
8856             /* Move on to the next alternative */
8857              
8858 0           code += GET(code, 1);
8859             }
8860 0 0         while (*code == OP_ALT); /* Loop for each alternative */
8861 0           return TRUE;
8862             }
8863              
8864              
8865              
8866             /*************************************************
8867             * Check for asserted fixed first char *
8868             *************************************************/
8869              
8870             /* During compilation, the "first char" settings from forward assertions are
8871             discarded, because they can cause conflicts with actual literals that follow.
8872             However, if we end up without a first char setting for an unanchored pattern,
8873             it is worth scanning the regex to see if there is an initial asserted first
8874             char. If all branches start with the same asserted char, or with a
8875             non-conditional bracket all of whose alternatives start with the same asserted
8876             char (recurse ad lib), then we return that char, with the flags set to zero or
8877             REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8878              
8879             Arguments:
8880             code points to start of expression (the bracket)
8881             flags points to the first char flags, or to REQ_NONE
8882             inassert TRUE if in an assertion
8883              
8884             Returns: the fixed first char, or 0 with REQ_NONE in flags
8885             */
8886              
8887             static pcre_uint32
8888 3           find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8889             BOOL inassert)
8890             {
                 /* c and cflags hold the candidate found so far; cflags stays negative
                 (REQ_NONE) until the first alternative has supplied one. */
8891 3           register pcre_uint32 c = 0;
8892 3           int cflags = REQ_NONE;
8893              
                 /* Report "no asserted char" up front so that every early "return 0"
                 below already leaves REQ_NONE in *flags. */
8894 3           *flags = REQ_NONE;
8895             do {
8896             pcre_uint32 d;
8897             int dflags;
                 /* xl covers the IMM2_SIZE extra units that a capturing bracket carries
                 after its link, so that the scan starts beyond them. */
8898 3 50         int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
    50          
8899 6 50         *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
    50          
8900 3           const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8901             TRUE);
8902 3           register pcre_uchar op = *scode;
8903              
8904 3           switch(op)
8905             {
                 /* Any opcode not listed below means this alternative has no asserted
                 first char, so the whole search gives up. */
8906             default:
8907 3           return 0;
8908              
8909             case OP_BRA:
8910             case OP_BRAPOS:
8911             case OP_CBRA:
8912             case OP_SCBRA:
8913             case OP_CBRAPOS:
8914             case OP_SCBRAPOS:
8915             case OP_ASSERT:
8916             case OP_ONCE:
8917             case OP_ONCE_NC:
                 /* Recurse into the nested group. It is treated as an assertion only
                 when the nested item is OP_ASSERT itself; the caller's own inassert
                 value is not passed down. */
8918 0           d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8919 0 0         if (dflags < 0)
8920 0           return 0;
                 /* The first result is adopted; every later alternative must agree on
                 both the character and the flags. */
8921 0 0         if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
    0          
    0          
8922 0           break;
8923              
8924             case OP_EXACT:
                 /* Skip IMM2_SIZE units (presumably the repeat count) so that scode[1]
                 is the character, as it is for the opcodes that follow. */
8925 0           scode += IMM2_SIZE;
8926             /* Fall through */
8927              
8928             case OP_CHAR:
8929             case OP_PLUS:
8930             case OP_MINPLUS:
8931             case OP_POSPLUS:
                 /* A literal counts only when it was found inside an assertion. */
8932 0 0         if (!inassert) return 0;
8933 0 0         if (cflags < 0) { c = scode[1]; cflags = 0; }
8934 0 0         else if (c != scode[1]) return 0;
8935 0           break;
8936              
8937             case OP_EXACTI:
                 /* Same skip as for OP_EXACT above. */
8938 0           scode += IMM2_SIZE;
8939             /* Fall through */
8940              
8941             case OP_CHARI:
8942             case OP_PLUSI:
8943             case OP_MINPLUSI:
8944             case OP_POSPLUSI:
                 /* Caseless counterpart of the case above: a first hit records
                 REQ_CASELESS in cflags.
                 NOTE(review): a later alternative is compared on the character only,
                 not on cflags, so a caseful first alternative followed by a caseless
                 one with the same character keeps cflags at 0 - confirm that this mix
                 cannot produce a wrong first-char setting. */
8945 0 0         if (!inassert) return 0;
8946 0 0         if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8947 0 0         else if (c != scode[1]) return 0;
8948 0           break;
8949             }
8950              
                 /* Use the value stored at offset 1 of the current item to move on to
                 whatever follows this alternative. */
8951 0           code += GET(code, 1);
8952             }
8953 0 0         while (*code == OP_ALT);
8954              
                 /* All alternatives agreed: hand back the common char and its flags. */
8955 0           *flags = cflags;
8956 0           return c;
8957             }
8958              
8959              
8960              
8961             /*************************************************
8962             * Add an entry to the name/number table *
8963             *************************************************/
8964              
8965             /* This function is called between compiling passes to add an entry to the
8966             name/number table, maintaining alphabetical order. Checking for permitted
8967             and forbidden duplicates has already been done.
8968              
8969             Arguments:
8970             cd the compile data block
8971             name the name to add
8972             length the length of the name
8973             groupno the group number
8974              
8975             Returns: nothing
8976             */
8977              
8978             static void
8979 0           add_name(compile_data *cd, const pcre_uchar *name, int length,
8980             unsigned int groupno)
8981             {
                 /* The table is walked one entry (cd->name_entry_size units) at a time.
                 As written at the bottom of this function, an entry holds the group
                 number in its first IMM2_SIZE units, then the name, then a zero unit.
                 No capacity check is made here; this assumes the caller has already
                 sized the table for one more entry - TODO confirm. */
8982             int i;
8983 0           pcre_uchar *slot = cd->name_table;
8984              
8985 0 0         for (i = 0; i < cd->names_found; i++)
8986             {
                 /* Only the first "length" units are compared. If they are equal but the
                 stored name carries on (no zero unit right after them), the new name is
                 a proper prefix of the stored one and is made to sort before it. */
8987 0           int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8988 0 0         if (crc == 0 && slot[IMM2_SIZE+length] != 0)
    0          
8989 0           crc = -1; /* Current name is a substring */
8990              
8991             /* Make space in the table and break the loop for an earlier name. For a
8992             duplicate or later name, carry on. We do this for duplicates so that in the
8993             simple case (when ?(| is not used) they are in order of their numbers. In all
8994             cases they are in the order in which they appear in the pattern. */
8995              
8996 0 0         if (crc < 0)
8997             {
                 /* Shift this entry and all that follow it up by one entry; memmove is
                 used because source and destination overlap. */
8998 0           memmove(slot + cd->name_entry_size, slot,
8999             IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
9000 0           break;
9001             }
9002              
9003             /* Continue the loop for a later or duplicate name */
9004              
9005 0           slot += cd->name_entry_size;
9006             }
9007              
                 /* slot is now either the gap just opened or the position after the last
                 existing entry. Write the new entry there and count it. */
9008 0           PUT2(slot, 0, groupno);
9009 0           memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
9010 0           slot[IMM2_SIZE + length] = 0;
9011 0           cd->names_found++;
9012 0           }
9013              
9014              
9015              
9016             /*************************************************
9017             * Compile a Regular Expression *
9018             *************************************************/
9019              
9020             /* This function takes a string and returns a pointer to a block of store
9021             holding a compiled version of the expression. The original API for this
9022             function had no error code return variable; it is retained for backwards
9023             compatibility. The new function is given a new name.
9024              
9025             Arguments:
9026             pattern the regular expression
9027             options various option bits
9028             errorcodeptr pointer to error code variable (pcre_compile2() only)
9029             can be NULL if you don't want a code value
9030             errorptr pointer to pointer to error text
9031             erroroffset ptr offset in pattern where error was detected
9032             tables pointer to character tables or NULL
9033              
9034             Returns: pointer to compiled data block, or NULL on error,
9035             with errorptr and erroroffset set
9036             */
9037              
                 /* Entry point without an error-code argument. Exactly one of the three
                 signatures below is compiled in, chosen by which COMPILE_PCREn macro is
                 defined; only the function name and the pattern type differ. */
9038             #if defined COMPILE_PCRE8
9039             PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
9040 46           pcre_compile(const char *pattern, int options, const char **errorptr,
9041             int *erroroffset, const unsigned char *tables)
9042             #elif defined COMPILE_PCRE16
9043             PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9044             pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
9045             int *erroroffset, const unsigned char *tables)
9046             #elif defined COMPILE_PCRE32
9047             PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9048             pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
9049             int *erroroffset, const unsigned char *tables)
9050             #endif
9051             {
                 /* Forward every argument unchanged to the matching *_compile2 function,
                 passing NULL for its errorcodeptr so that no numeric error code is
                 requested. */
9052             #if defined COMPILE_PCRE8
9053 46           return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
9054             #elif defined COMPILE_PCRE16
9055             return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
9056             #elif defined COMPILE_PCRE32
9057             return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
9058             #endif
9059             }
9060              
9061              
9062             #if defined COMPILE_PCRE8
9063             PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
9064 46           pcre_compile2(const char *pattern, int options, int *errorcodeptr,
9065             const char **errorptr, int *erroroffset, const unsigned char *tables)
9066             #elif defined COMPILE_PCRE16
9067             PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9068             pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
9069             const char **errorptr, int *erroroffset, const unsigned char *tables)
9070             #elif defined COMPILE_PCRE32
9071             PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9072             pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
9073             const char **errorptr, int *erroroffset, const unsigned char *tables)
9074             #endif
9075             {
9076             REAL_PCRE *re;
9077 46           int length = 1; /* For final END opcode */
9078             pcre_int32 firstcharflags, reqcharflags;
9079             pcre_uint32 firstchar, reqchar;
9080 46           pcre_uint32 limit_match = PCRE_UINT32_MAX;
9081 46           pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9082             int newline;
9083 46           int errorcode = 0;
9084 46           int skipatstart = 0;
9085             BOOL utf;
9086 46           BOOL never_utf = FALSE;
9087             size_t size;
9088             pcre_uchar *code;
9089             const pcre_uchar *codestart;
9090             const pcre_uchar *ptr;
9091             compile_data compile_block;
9092 46           compile_data *cd = &compile_block;
9093              
9094             /* This space is used for "compiling" into during the first phase, when we are
9095             computing the amount of memory that is needed. Compiled items are thrown away
9096             as soon as possible, so that a fairly large buffer should be sufficient for
9097             this purpose. The same space is used in the second phase for remembering where
9098             to fill in forward references to subpatterns. That may overflow, in which case
9099             new memory is obtained from malloc(). */
9100              
9101             pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9102              
9103             /* This vector is used for remembering name groups during the pre-compile. In a
9104             similar way to cworkspace, it can be expanded using malloc() if necessary. */
9105              
9106             named_group named_groups[NAMED_GROUP_LIST_SIZE];
9107              
9108             /* Set this early so that early errors get offset 0. */
9109              
9110 46           ptr = (const pcre_uchar *)pattern;
9111              
9112             /* We can't pass back an error message if errorptr is NULL; I guess the best we
9113             can do is just return NULL, but we can set a code value if there is a code
9114             pointer. */
9115              
9116 46 50         if (errorptr == NULL)
9117             {
9118 0 0         if (errorcodeptr != NULL) *errorcodeptr = 99;
9119 0           return NULL;
9120             }
9121              
9122 46           *errorptr = NULL;
9123 46 50         if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9124              
9125             /* However, we can give a message for this error */
9126              
9127 46 50         if (erroroffset == NULL)
9128             {
9129 0           errorcode = ERR16;
9130 0           goto PCRE_EARLY_ERROR_RETURN2;
9131             }
9132              
9133 46           *erroroffset = 0;
9134              
9135             /* Set up pointers to the individual character tables */
9136              
9137 46 50         if (tables == NULL) tables = PRIV(default_tables);
9138 46           cd->lcc = tables + lcc_offset;
9139 46           cd->fcc = tables + fcc_offset;
9140 46           cd->cbits = tables + cbits_offset;
9141 46           cd->ctypes = tables + ctypes_offset;
9142              
9143             /* Check that all undefined public option bits are zero */
9144              
9145 46 50         if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9146             {
9147 0           errorcode = ERR17;
9148 0           goto PCRE_EARLY_ERROR_RETURN;
9149             }
9150              
9151             /* If PCRE_NEVER_UTF is set, remember it. */
9152              
9153 46 50         if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9154              
9155             /* Check for global one-time settings at the start of the pattern, and remember
9156             the offset for later. */
9157              
9158 46           cd->external_flags = 0; /* Initialize here for LIMIT_MATCH/RECURSION */
9159              
9160 46 50         while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
    0          
9161 0           ptr[skipatstart+1] == CHAR_ASTERISK)
9162             {
9163 0           int newnl = 0;
9164 0           int newbsr = 0;
9165              
9166             /* For completeness and backward compatibility, (*UTFn) is supported in the
9167             relevant libraries, but (*UTF) is generic and always supported. Note that
9168             PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9169              
9170             #ifdef COMPILE_PCRE8
9171 0 0         if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9172 0           { skipatstart += 7; options |= PCRE_UTF8; continue; }
9173             #endif
9174             #ifdef COMPILE_PCRE16
9175             if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9176             { skipatstart += 8; options |= PCRE_UTF16; continue; }
9177             #endif
9178             #ifdef COMPILE_PCRE32
9179             if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9180             { skipatstart += 8; options |= PCRE_UTF32; continue; }
9181             #endif
9182              
9183 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9184 0           { skipatstart += 6; options |= PCRE_UTF8; continue; }
9185 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9186 0           { skipatstart += 6; options |= PCRE_UCP; continue; }
9187 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9188 0           { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9189 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9190 0           { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9191              
9192 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9193             {
9194 0           pcre_uint32 c = 0;
9195 0           int p = skipatstart + 14;
9196 0 0         while (isdigit(ptr[p]))
9197             {
9198 0 0         if (c > PCRE_UINT32_MAX / 10 - 1) break; /* Integer overflow */
9199 0           c = c*10 + ptr[p++] - CHAR_0;
9200             }
9201 0 0         if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9202 0 0         if (c < limit_match)
9203             {
9204 0           limit_match = c;
9205 0           cd->external_flags |= PCRE_MLSET;
9206             }
9207 0           skipatstart = p;
9208 0           continue;
9209             }
9210              
9211 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9212             {
9213 0           pcre_uint32 c = 0;
9214 0           int p = skipatstart + 18;
9215 0 0         while (isdigit(ptr[p]))
9216             {
9217 0 0         if (c > PCRE_UINT32_MAX / 10 - 1) break; /* Integer overflow check */
9218 0           c = c*10 + ptr[p++] - CHAR_0;
9219             }
9220 0 0         if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9221 0 0         if (c < limit_recursion)
9222             {
9223 0           limit_recursion = c;
9224 0           cd->external_flags |= PCRE_RLSET;
9225             }
9226 0           skipatstart = p;
9227 0           continue;
9228             }
9229              
9230 0 0         if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9231 0           { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9232 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3) == 0)
9233 0           { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9234 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5) == 0)
9235 0           { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9236 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9237 0           { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9238 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9239 0           { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9240              
9241 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9242 0           { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9243 0 0         else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9244 0           { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9245              
9246 0 0         if (newnl != 0)
9247 0           options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9248 0 0         else if (newbsr != 0)
9249 0           options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9250 0           else break;
9251             }
9252              
9253             /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9254 46           utf = (options & PCRE_UTF8) != 0;
9255 46 50         if (utf && never_utf)
    0          
9256             {
9257 0           errorcode = ERR78;
9258 0           goto PCRE_EARLY_ERROR_RETURN2;
9259             }
9260              
9261             /* Can't support UTF unless PCRE has been compiled to include the code. The
9262             return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9263             release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9264             not used here. */
9265              
9266             #ifdef SUPPORT_UTF
9267             if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9268             (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9269             {
9270             #if defined COMPILE_PCRE8
9271             errorcode = ERR44;
9272             #elif defined COMPILE_PCRE16
9273             errorcode = ERR74;
9274             #elif defined COMPILE_PCRE32
9275             errorcode = ERR77;
9276             #endif
9277             goto PCRE_EARLY_ERROR_RETURN2;
9278             }
9279             #else
9280 46 50         if (utf)
9281             {
9282 0           errorcode = ERR32;
9283 0           goto PCRE_EARLY_ERROR_RETURN;
9284             }
9285             #endif
9286              
9287             /* Can't support UCP unless PCRE has been compiled to include the code. */
9288              
9289             #ifndef SUPPORT_UCP
9290 46 50         if ((options & PCRE_UCP) != 0)
9291             {
9292 0           errorcode = ERR67;
9293 0           goto PCRE_EARLY_ERROR_RETURN;
9294             }
9295             #endif
9296              
9297             /* Check validity of \R options. */
9298              
9299 46 50         if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9300             (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9301             {
9302 0           errorcode = ERR56;
9303 0           goto PCRE_EARLY_ERROR_RETURN;
9304             }
9305              
9306             /* Handle different types of newline. The three bits give seven cases. The
9307             current code allows for fixed one- or two-byte sequences, plus "any" and
9308             "anycrlf". */
9309              
9310 46           switch (options & PCRE_NEWLINE_BITS)
9311             {
9312 46           case 0: newline = NEWLINE; break; /* Build-time default */
9313 0           case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9314 0           case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9315             case PCRE_NEWLINE_CR+
9316 0           PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9317 0           case PCRE_NEWLINE_ANY: newline = -1; break;
9318 0           case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9319 0           default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9320             }
9321              
9322 46 50         if (newline == -2)
9323             {
9324 46           cd->nltype = NLTYPE_ANYCRLF;
9325             }
9326 0 0         else if (newline < 0)
9327             {
9328 0           cd->nltype = NLTYPE_ANY;
9329             }
9330             else
9331             {
9332 0           cd->nltype = NLTYPE_FIXED;
9333 0 0         if (newline > 255)
9334             {
9335 0           cd->nllen = 2;
9336 0           cd->nl[0] = (newline >> 8) & 255;
9337 0           cd->nl[1] = newline & 255;
9338             }
9339             else
9340             {
9341 0           cd->nllen = 1;
9342 0           cd->nl[0] = newline;
9343             }
9344             }
9345              
9346             /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9347             references to help in deciding whether (.*) can be treated as anchored or not.
9348             */
9349              
9350 46           cd->top_backref = 0;
9351 46           cd->backref_map = 0;
9352              
9353             /* Reflect pattern for debugging output */
9354              
9355             DPRINTF(("------------------------------------------------------------------\n"));
9356             #ifdef PCRE_DEBUG
9357             print_puchar(stdout, (PCRE_PUCHAR)pattern);
9358             #endif
9359             DPRINTF(("\n"));
9360              
9361             /* Pretend to compile the pattern while actually just accumulating the length
9362             of memory required. This behaviour is triggered by passing a non-NULL final
9363             argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9364             to compile parts of the pattern into; the compiled code is discarded when it is
9365             no longer needed, so hopefully this workspace will never overflow, though there
9366             is a test for its doing so. */
9367              
9368 46           cd->bracount = cd->final_bracount = 0;
9369 46           cd->names_found = 0;
9370 46           cd->name_entry_size = 0;
9371 46           cd->name_table = NULL;
9372 46           cd->dupnames = FALSE;
9373 46           cd->dupgroups = FALSE;
9374 46           cd->namedrefcount = 0;
9375 46           cd->start_code = cworkspace;
9376 46           cd->hwm = cworkspace;
9377 46           cd->iscondassert = FALSE;
9378 46           cd->start_workspace = cworkspace;
9379 46           cd->workspace_size = COMPILE_WORK_SIZE;
9380 46           cd->named_groups = named_groups;
9381 46           cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9382 46           cd->start_pattern = (const pcre_uchar *)pattern;
9383 46           cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9384 46           cd->req_varyopt = 0;
9385 46           cd->parens_depth = 0;
9386 46           cd->assert_depth = 0;
9387 46           cd->max_lookbehind = 0;
9388 46           cd->external_options = options;
9389 46           cd->open_caps = NULL;
9390              
9391             /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9392             don't need to look at the result of the function here. The initial options have
9393             been put into the cd block so that they can be changed if an option setting is
9394             found within the regex right at the beginning. Bringing initial option settings
9395             outside can help speed up starting point checks. */
9396              
9397 46           ptr += skipatstart;
9398 46           code = cworkspace;
9399 46           *code = OP_BRA;
9400              
9401 46           (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9402             FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9403             cd, &length);
9404 46 50         if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9405              
9406             DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9407             (int)(cd->hwm - cworkspace)));
9408              
9409 46 50         if (length > MAX_PATTERN_SIZE)
9410             {
9411 0           errorcode = ERR20;
9412 0           goto PCRE_EARLY_ERROR_RETURN;
9413             }
9414              
9415             /* Compute the size of the data block for storing the compiled pattern. Integer
9416             overflow should no longer be possible because nowadays we limit the maximum
9417             value of cd->names_found and cd->name_entry_size. */
9418              
9419 46           size = sizeof(REAL_PCRE) +
9420 46           (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9421              
9422             /* Get the memory. */
9423              
9424 46           re = (REAL_PCRE *)(PUBL(malloc))(size);
9425 46 50         if (re == NULL)
9426             {
9427 0           errorcode = ERR21;
9428 0           goto PCRE_EARLY_ERROR_RETURN;
9429             }
9430              
9431             /* Put in the magic number, and save the sizes, initial options, internal
9432             flags, and character table pointer. NULL is used for the default character
9433             tables. The nullpad field is at the end; it's there to help in the case when a
9434             regex compiled on a system with 4-byte pointers is run on another with 8-byte
9435             pointers. */
9436              
9437 46           re->magic_number = MAGIC_NUMBER;
9438 46           re->size = (int)size;
9439 46           re->options = cd->external_options;
9440 46           re->flags = cd->external_flags;
9441 46           re->limit_match = limit_match;
9442 46           re->limit_recursion = limit_recursion;
9443 46           re->first_char = 0;
9444 46           re->req_char = 0;
9445 46           re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9446 46           re->name_entry_size = cd->name_entry_size;
9447 46           re->name_count = cd->names_found;
9448 46           re->ref_count = 0;
9449 46 50         re->tables = (tables == PRIV(default_tables))? NULL : tables;
9450 46           re->nullpad = NULL;
9451             #ifdef COMPILE_PCRE32
9452             re->dummy = 0;
9453             #else
9454 46           re->dummy1 = re->dummy2 = re->dummy3 = 0;
9455             #endif
9456              
9457             /* The starting points of the name/number translation table and of the code are
9458             passed around in the compile data block. The start/end pattern and initial
9459             options are already set from the pre-compile phase, as is the name_entry_size
9460             field. Reset the bracket count and the names_found field. Also reset the hwm
9461             field; this time it's used for remembering forward references to subpatterns.
9462             */
9463              
9464 46           cd->final_bracount = cd->bracount; /* Save for checking forward references */
9465 46           cd->parens_depth = 0;
9466 46           cd->assert_depth = 0;
9467 46           cd->bracount = 0;
9468 46           cd->max_lookbehind = 0;
9469 46           cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9470 46           codestart = cd->name_table + re->name_entry_size * re->name_count;
9471 46           cd->start_code = codestart;
9472 46           cd->hwm = (pcre_uchar *)(cd->start_workspace);
9473 46           cd->iscondassert = FALSE;
9474 46           cd->req_varyopt = 0;
9475 46           cd->had_accept = FALSE;
9476 46           cd->had_pruneorskip = FALSE;
9477 46           cd->check_lookbehind = FALSE;
9478 46           cd->open_caps = NULL;
9479              
9480             /* If any named groups were found, create the name/number table from the list
9481             created in the first pass. */
9482              
9483 46 50         if (cd->names_found > 0)
9484             {
9485 0           int i = cd->names_found;
9486 0           named_group *ng = cd->named_groups;
9487 0           cd->names_found = 0;
9488 0 0         for (; i > 0; i--, ng++)
9489 0           add_name(cd, ng->name, ng->length, ng->number);
9490 0 0         if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9491 0           (PUBL(free))((void *)cd->named_groups);
9492             }
9493              
9494             /* Set up a starting, non-extracting bracket, then compile the expression. On
9495             error, errorcode will be set non-zero, so we don't need to look at the result
9496             of the function here. */
9497              
9498 46           ptr = (const pcre_uchar *)pattern + skipatstart;
9499 46           code = (pcre_uchar *)codestart;
9500 46           *code = OP_BRA;
9501 46           (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9502             &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9503 46           re->top_bracket = cd->bracount;
9504 46           re->top_backref = cd->top_backref;
9505 46           re->max_lookbehind = cd->max_lookbehind;
9506 46           re->flags = cd->external_flags | PCRE_MODE;
9507              
9508 46 50         if (cd->had_accept)
9509             {
9510 0           reqchar = 0; /* Must disable after (*ACCEPT) */
9511 0           reqcharflags = REQ_NONE;
9512             }
9513              
9514             /* If not reached end of pattern on success, there's an excess bracket. */
9515              
9516 46 50         if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
    50          
9517              
9518             /* Fill in the terminating state and check for disastrous overflow, but
9519             if debugging, leave the test till after things are printed out. */
9520              
9521 46           *code++ = OP_END;
9522              
9523             #ifndef PCRE_DEBUG
9524 46 50         if (code - codestart > length) errorcode = ERR23;
9525             #endif
9526              
9527             #ifdef SUPPORT_VALGRIND
9528             /* If the estimated length exceeds the really used length, mark the extra
9529             allocated memory as unaddressable, so that any out-of-bound reads can be
9530             detected. */
9531             VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9532             #endif
9533              
9534             /* Fill in any forward references that are required. There may be repeated
9535             references; optimize for them, as searching a large regex takes time. */
9536              
9537 46 50         if (cd->hwm > cd->start_workspace)
9538             {
9539 0           int prev_recno = -1;
9540 0           const pcre_uchar *groupptr = NULL;
9541 0 0         while (errorcode == 0 && cd->hwm > cd->start_workspace)
    0          
9542             {
9543             int offset, recno;
9544 0           cd->hwm -= LINK_SIZE;
9545 0           offset = GET(cd->hwm, 0);
9546              
9547             /* Check that the hwm handling hasn't gone wrong. This whole area is
9548             rewritten in PCRE2 because there are some obscure cases. */
9549              
9550 0 0         if (offset == 0 || codestart[offset-1] != OP_RECURSE)
    0          
9551             {
9552 0           errorcode = ERR10;
9553 0           break;
9554             }
9555              
9556 0           recno = GET(codestart, offset);
9557 0 0         if (recno != prev_recno)
9558             {
9559 0           groupptr = PRIV(find_bracket)(codestart, utf, recno);
9560 0           prev_recno = recno;
9561             }
9562 0 0         if (groupptr == NULL) errorcode = ERR53;
9563 0           else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9564             }
9565             }
9566              
9567             /* If the workspace had to be expanded, free the new memory. Set the pointer to
9568             NULL to indicate that forward references have been filled in. */
9569              
9570 46 50         if (cd->workspace_size > COMPILE_WORK_SIZE)
9571 0           (PUBL(free))((void *)cd->start_workspace);
9572 46           cd->start_workspace = NULL;
9573              
9574             /* Give an error if there's back reference to a non-existent capturing
9575             subpattern. */
9576              
9577 46 50         if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
    50          
9578              
9579             /* Unless disabled, check whether any single character iterators can be
9580             auto-possessified. The function overwrites the appropriate opcode values, so
9581             the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9582             used in this code because at least one compiler gives a warning about loss of
9583             "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9584             function call. */
9585              
9586 46 50         if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
    50          
9587             {
9588 46           pcre_uchar *temp = (pcre_uchar *)codestart;
9589 46           auto_possessify(temp, utf, cd);
9590             }
9591              
9592             /* If there were any lookbehind assertions that contained OP_RECURSE
9593             (recursions or subroutine calls), a flag is set for them to be checked here,
9594             because they may contain forward references. Actual recursions cannot be fixed
9595             length, but subroutine calls can. It is done like this so that those without
9596             OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9597             exceptional ones forgo this. We scan the pattern to check that they are fixed
9598             length, and set their lengths. */
9599              
9600 46 50         if (errorcode == 0 && cd->check_lookbehind)
    50          
9601             {
9602 0           pcre_uchar *cc = (pcre_uchar *)codestart;
9603              
9604             /* Loop, searching for OP_REVERSE items, and process those that do not have
9605             their length set. (Actually, it will also re-process any that have a length
9606             of zero, but that is a pathological case, and it does no harm.) When we find
9607             one, we temporarily terminate the branch it is in while we scan it. */
9608              
9609 0 0         for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9610             cc != NULL;
9611 0           cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9612             {
9613 0 0         if (GET(cc, 1) == 0)
9614             {
9615             int fixed_length;
9616 0           pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9617 0           int end_op = *be;
9618 0           *be = OP_END;
9619 0           fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9620             cd, NULL);
9621 0           *be = end_op;
9622             DPRINTF(("fixed length = %d\n", fixed_length));
9623 0 0         if (fixed_length < 0)
9624             {
9625 0 0         errorcode = (fixed_length == -2)? ERR36 :
9626 0 0         (fixed_length == -4)? ERR70 : ERR25;
9627 0           break;
9628             }
9629 0 0         if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9630 0           PUT(cc, 1, fixed_length);
9631             }
9632 0           cc += 1 + LINK_SIZE;
9633             }
9634             }
9635              
9636             /* Failed to compile, or error while post-processing */
9637              
9638 46 50         if (errorcode != 0)
9639             {
9640 0           (PUBL(free))(re);
9641             PCRE_EARLY_ERROR_RETURN:
9642 0           *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9643             PCRE_EARLY_ERROR_RETURN2:
9644 0           *errorptr = find_error_text(errorcode);
9645 0 0         if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9646 0           return NULL;
9647             }
9648              
9649             /* If the anchored option was not passed, set the flag if we can determine that
9650             the pattern is anchored by virtue of ^ characters or \A or anything else, such
9651             as starting with non-atomic .* when DOTALL is set and there are no occurrences
9652             of *PRUNE or *SKIP.
9653              
9654             Otherwise, if we know what the first byte has to be, save it, because that
9655             speeds up unanchored matches no end. If not, see if we can set the
9656             PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9657             start with ^. and also when all branches start with non-atomic .* for
9658             non-DOTALL matches when *PRUNE and SKIP are not present. */
9659              
9660 46 50         if ((re->options & PCRE_ANCHORED) == 0)
9661             {
9662 46 100         if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9663             else
9664             {
9665 41 100         if (firstcharflags < 0)
9666 3           firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9667 41 100         if (firstcharflags >= 0) /* Remove caseless flag for non-caseable chars */
9668             {
9669             #if defined COMPILE_PCRE8
9670 38           re->first_char = firstchar & 0xff;
9671             #elif defined COMPILE_PCRE16
9672             re->first_char = firstchar & 0xffff;
9673             #elif defined COMPILE_PCRE32
9674             re->first_char = firstchar;
9675             #endif
9676 38 50         if ((firstcharflags & REQ_CASELESS) != 0)
9677             {
9678             #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9679             /* We ignore non-ASCII first chars in 8 bit mode. */
9680             if (utf)
9681             {
9682             if (re->first_char < 128)
9683             {
9684             if (cd->fcc[re->first_char] != re->first_char)
9685             re->flags |= PCRE_FCH_CASELESS;
9686             }
9687             else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9688             re->flags |= PCRE_FCH_CASELESS;
9689             }
9690             else
9691             #endif
9692 0 0         if (MAX_255(re->first_char)
9693 0           && cd->fcc[re->first_char] != re->first_char)
9694 0           re->flags |= PCRE_FCH_CASELESS;
9695             }
9696              
9697 38           re->flags |= PCRE_FIRSTSET;
9698             }
9699              
9700 3 50         else if (is_startline(codestart, 0, cd, 0, FALSE)) re->flags |= PCRE_STARTLINE;
9701             }
9702             }
9703              
9704             /* For an anchored pattern, we use the "required byte" only if it follows a
9705             variable length item in the regex. Remove the caseless flag for non-caseable
9706             bytes. */
9707              
9708 46 100         if (reqcharflags >= 0 &&
    100          
9709 3 50         ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9710             {
9711             #if defined COMPILE_PCRE8
9712 41           re->req_char = reqchar & 0xff;
9713             #elif defined COMPILE_PCRE16
9714             re->req_char = reqchar & 0xffff;
9715             #elif defined COMPILE_PCRE32
9716             re->req_char = reqchar;
9717             #endif
9718 41 50         if ((reqcharflags & REQ_CASELESS) != 0)
9719             {
9720             #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9721             /* We ignore non-ASCII first chars in 8 bit mode. */
9722             if (utf)
9723             {
9724             if (re->req_char < 128)
9725             {
9726             if (cd->fcc[re->req_char] != re->req_char)
9727             re->flags |= PCRE_RCH_CASELESS;
9728             }
9729             else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9730             re->flags |= PCRE_RCH_CASELESS;
9731             }
9732             else
9733             #endif
9734 0 0         if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9735 0           re->flags |= PCRE_RCH_CASELESS;
9736             }
9737              
9738 41           re->flags |= PCRE_REQCHSET;
9739             }
9740              
9741             /* Print out the compiled data if debugging is enabled. This is never the
9742             case when building a production library. */
9743              
9744             #ifdef PCRE_DEBUG
9745             printf("Length = %d top_bracket = %d top_backref = %d\n",
9746             length, re->top_bracket, re->top_backref);
9747              
9748             printf("Options=%08x\n", re->options);
9749              
9750             if ((re->flags & PCRE_FIRSTSET) != 0)
9751             {
9752             pcre_uchar ch = re->first_char;
9753             const char *caseless =
9754             ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9755             if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9756             else printf("First char = \\x%02x%s\n", ch, caseless);
9757             }
9758              
9759             if ((re->flags & PCRE_REQCHSET) != 0)
9760             {
9761             pcre_uchar ch = re->req_char;
9762             const char *caseless =
9763             ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9764             if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9765             else printf("Req char = \\x%02x%s\n", ch, caseless);
9766             }
9767              
9768             #if defined COMPILE_PCRE8
9769             pcre_printint((pcre *)re, stdout, TRUE);
9770             #elif defined COMPILE_PCRE16
9771             pcre16_printint((pcre *)re, stdout, TRUE);
9772             #elif defined COMPILE_PCRE32
9773             pcre32_printint((pcre *)re, stdout, TRUE);
9774             #endif
9775              
9776             /* This check is done here in the debugging case so that the code that
9777             was compiled can be seen. */
9778              
9779             if (code - codestart > length)
9780             {
9781             (PUBL(free))(re);
9782             *errorptr = find_error_text(ERR23);
9783             *erroroffset = ptr - (pcre_uchar *)pattern;
9784             if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9785             return NULL;
9786             }
9787             #endif /* PCRE_DEBUG */
9788              
9789             /* Check for a pattern that can match an empty string, so that this information
9790             can be provided to applications. */
9791              
9792             do
9793             {
9794 46 100         if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9795             {
9796 5           re->flags |= PCRE_MATCH_EMPTY;
9797 5           break;
9798             }
9799 41           codestart += GET(codestart, 1);
9800             }
9801 41 50         while (*codestart == OP_ALT);
9802              
9803             #if defined COMPILE_PCRE8
9804 46           return (pcre *)re;
9805             #elif defined COMPILE_PCRE16
9806             return (pcre16 *)re;
9807             #elif defined COMPILE_PCRE32
9808             return (pcre32 *)re;
9809             #endif
9810             }
9811              
9812             /* End of pcre_compile.c */