File Coverage

deps/libgit2/deps/pcre/pcre_study.c
Criterion Covered Total %
statement 0 362 0.0
branch 0 170 0.0
condition n/a
subroutine n/a
pod n/a
total 0 532 0.0


line stmt bran cond sub pod time code
1             /*************************************************
2             * Perl-Compatible Regular Expressions *
3             *************************************************/
4              
5             /* PCRE is a library of functions to support regular expressions whose syntax
6             and semantics are as close as possible to those of the Perl 5 language.
7              
8             Written by Philip Hazel
9             Copyright (c) 1997-2012 University of Cambridge
10              
11             -----------------------------------------------------------------------------
12             Redistribution and use in source and binary forms, with or without
13             modification, are permitted provided that the following conditions are met:
14              
15             * Redistributions of source code must retain the above copyright notice,
16             this list of conditions and the following disclaimer.
17              
18             * Redistributions in binary form must reproduce the above copyright
19             notice, this list of conditions and the following disclaimer in the
20             documentation and/or other materials provided with the distribution.
21              
22             * Neither the name of the University of Cambridge nor the names of its
23             contributors may be used to endorse or promote products derived from
24             this software without specific prior written permission.
25              
26             THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27             AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28             IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29             ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30             LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31             CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33             INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34             CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35             ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36             POSSIBILITY OF SUCH DAMAGE.
37             -----------------------------------------------------------------------------
38             */
39              
40              
41             /* This module contains the external function pcre_study(), along with local
42             supporting functions. */
43              
44              
45             #ifdef HAVE_CONFIG_H
46             #include "config.h"
47             #endif
48              
49             #include "pcre_internal.h"
50              
51             #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52              
53             /* Returns from set_start_bits() */
54              
55             enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56              
57              
58              
59             /*************************************************
60             * Find the minimum subject length for a group *
61             *************************************************/
62              
63             /* Scan a parenthesized group and compute the minimum length of subject that
64             is needed to match it. This is a lower bound; it does not mean there is a
65             string of that length that matches. In UTF8 mode, the result is in characters
66             rather than bytes.
67              
68             Arguments:
69             re compiled pattern block
70             code pointer to start of group (the bracket)
71             startcode pointer to start of the whole pattern's code
72             options the compiling options
73             recurses chain of recurse_check to catch mutual recursion
74             countptr pointer to call count (to catch over complexity)
75              
76             Returns: the minimum length
77             -1 if \C in UTF-8 mode or (*ACCEPT) was encountered, or the pattern is too complex
78             -2 internal error (missing capturing bracket)
79             -3 internal error (opcode not listed)
80             */
81              
82             static int
83 0           find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
84             const pcre_uchar *startcode, int options, recurse_check *recurses,
85             int *countptr)
86             {
87 0           int length = -1;
88             /* PCRE_UTF16 has the same value as PCRE_UTF8. */
89 0           BOOL utf = (options & PCRE_UTF8) != 0;
90 0           BOOL had_recurse = FALSE;
91             recurse_check this_recurse;
92 0           register int branchlength = 0;
93 0           register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
94              
95 0 0         if ((*countptr)++ > 1000) return -1; /* too complex */
96              
97 0 0         if (*code == OP_CBRA || *code == OP_SCBRA ||
    0          
    0          
98 0 0         *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
99              
100             /* Scan along the opcodes for this branch. If we get to the end of the
101             branch, check the length against that of the other branches. */
102              
103             for (;;)
104             {
105             int d, min;
106             pcre_uchar *cs, *ce;
107 0           register pcre_uchar op = *cc;
108              
109 0           switch (op)
110             {
111             case OP_COND:
112             case OP_SCOND:
113              
114             /* If there is only one branch in a condition, the implied branch has zero
115             length, so we don't add anything. This covers the DEFINE "condition"
116             automatically. */
117              
118 0           cs = cc + GET(cc, 1);
119 0 0         if (*cs != OP_ALT)
120             {
121 0           cc = cs + 1 + LINK_SIZE;
122 0           break;
123             }
124              
125             /* Otherwise we can fall through and treat it the same as any other
126             subpattern. */
127              
128             case OP_CBRA:
129             case OP_SCBRA:
130             case OP_BRA:
131             case OP_SBRA:
132             case OP_CBRAPOS:
133             case OP_SCBRAPOS:
134             case OP_BRAPOS:
135             case OP_SBRAPOS:
136             case OP_ONCE:
137             case OP_ONCE_NC:
138 0           d = find_minlength(re, cc, startcode, options, recurses, countptr);
139 0 0         if (d < 0) return d;
140 0           branchlength += d;
141 0 0         do cc += GET(cc, 1); while (*cc == OP_ALT);
142 0           cc += 1 + LINK_SIZE;
143 0           break;
144              
145             /* ACCEPT makes things far too complicated; we have to give up. */
146              
147             case OP_ACCEPT:
148             case OP_ASSERT_ACCEPT:
149 0           return -1;
150              
151             /* Reached end of a branch; if it's a ket it is the end of a nested
152             call. If it's ALT it is an alternation in a nested call. If it is END it's
153             the end of the outer call. All can be handled by the same code. If an
154             ACCEPT was previously encountered, use the length that was in force at that
155             time, and pass back the shortest ACCEPT length. */
156              
157             case OP_ALT:
158             case OP_KET:
159             case OP_KETRMAX:
160             case OP_KETRMIN:
161             case OP_KETRPOS:
162             case OP_END:
163 0 0         if (length < 0 || (!had_recurse && branchlength < length))
    0          
    0          
164 0           length = branchlength;
165 0 0         if (op != OP_ALT) return length;
166 0           cc += 1 + LINK_SIZE;
167 0           branchlength = 0;
168 0           had_recurse = FALSE;
169 0           break;
170              
171             /* Skip over assertive subpatterns */
172              
173             case OP_ASSERT:
174             case OP_ASSERT_NOT:
175             case OP_ASSERTBACK:
176             case OP_ASSERTBACK_NOT:
177 0 0         do cc += GET(cc, 1); while (*cc == OP_ALT);
178             /* Fall through */
179              
180             /* Skip over things that don't match chars */
181              
182             case OP_REVERSE:
183             case OP_CREF:
184             case OP_DNCREF:
185             case OP_RREF:
186             case OP_DNRREF:
187             case OP_DEF:
188             case OP_CALLOUT:
189             case OP_SOD:
190             case OP_SOM:
191             case OP_EOD:
192             case OP_EODN:
193             case OP_CIRC:
194             case OP_CIRCM:
195             case OP_DOLL:
196             case OP_DOLLM:
197             case OP_NOT_WORD_BOUNDARY:
198             case OP_WORD_BOUNDARY:
199 0           cc += PRIV(OP_lengths)[*cc];
200 0           break;
201              
202             /* Skip over a subpattern that has a {0} or {0,x} quantifier */
203              
204             case OP_BRAZERO:
205             case OP_BRAMINZERO:
206             case OP_BRAPOSZERO:
207             case OP_SKIPZERO:
208 0           cc += PRIV(OP_lengths)[*cc];
209 0 0         do cc += GET(cc, 1); while (*cc == OP_ALT);
210 0           cc += 1 + LINK_SIZE;
211 0           break;
212              
213             /* Handle literal characters and + repetitions */
214              
215             case OP_CHAR:
216             case OP_CHARI:
217             case OP_NOT:
218             case OP_NOTI:
219             case OP_PLUS:
220             case OP_PLUSI:
221             case OP_MINPLUS:
222             case OP_MINPLUSI:
223             case OP_POSPLUS:
224             case OP_POSPLUSI:
225             case OP_NOTPLUS:
226             case OP_NOTPLUSI:
227             case OP_NOTMINPLUS:
228             case OP_NOTMINPLUSI:
229             case OP_NOTPOSPLUS:
230             case OP_NOTPOSPLUSI:
231 0           branchlength++;
232 0           cc += 2;
233             #ifdef SUPPORT_UTF
234             if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
235             #endif
236 0           break;
237              
238             case OP_TYPEPLUS:
239             case OP_TYPEMINPLUS:
240             case OP_TYPEPOSPLUS:
241 0           branchlength++;
242 0 0         cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
    0          
243 0           break;
244              
245             /* Handle exact repetitions. The count is already in characters, but we
246             need to skip over a multibyte character in UTF8 mode. */
247              
248             case OP_EXACT:
249             case OP_EXACTI:
250             case OP_NOTEXACT:
251             case OP_NOTEXACTI:
252 0           branchlength += GET2(cc,1);
253 0           cc += 2 + IMM2_SIZE;
254             #ifdef SUPPORT_UTF
255             if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
256             #endif
257 0           break;
258              
259             case OP_TYPEEXACT:
260 0           branchlength += GET2(cc,1);
261 0 0         cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
262 0 0         || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
263 0           break;
264              
265             /* Handle single-char non-literal matchers */
266              
267             case OP_PROP:
268             case OP_NOTPROP:
269 0           cc += 2;
270             /* Fall through */
271              
272             case OP_NOT_DIGIT:
273             case OP_DIGIT:
274             case OP_NOT_WHITESPACE:
275             case OP_WHITESPACE:
276             case OP_NOT_WORDCHAR:
277             case OP_WORDCHAR:
278             case OP_ANY:
279             case OP_ALLANY:
280             case OP_EXTUNI:
281             case OP_HSPACE:
282             case OP_NOT_HSPACE:
283             case OP_VSPACE:
284             case OP_NOT_VSPACE:
285 0           branchlength++;
286 0           cc++;
287 0           break;
288              
289             /* "Any newline" might match two characters, but it also might match just
290             one. */
291              
292             case OP_ANYNL:
293 0           branchlength += 1;
294 0           cc++;
295 0           break;
296              
297             /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
298             non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
299             appear, but leave the code, just in case.) */
300              
301             case OP_ANYBYTE:
302             #ifdef SUPPORT_UTF
303             if (utf) return -1;
304             #endif
305 0           branchlength++;
306 0           cc++;
307 0           break;
308              
309             /* For repeated character types, we have to test for \p and \P, which have
310             an extra two bytes of parameters. */
311              
312             case OP_TYPESTAR:
313             case OP_TYPEMINSTAR:
314             case OP_TYPEQUERY:
315             case OP_TYPEMINQUERY:
316             case OP_TYPEPOSSTAR:
317             case OP_TYPEPOSQUERY:
318 0 0         if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
    0          
319 0           cc += PRIV(OP_lengths)[op];
320 0           break;
321              
322             case OP_TYPEUPTO:
323             case OP_TYPEMINUPTO:
324             case OP_TYPEPOSUPTO:
325 0 0         if (cc[1 + IMM2_SIZE] == OP_PROP
326 0 0         || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
327 0           cc += PRIV(OP_lengths)[op];
328 0           break;
329              
330             /* Check a class for variable quantification */
331              
332             case OP_CLASS:
333             case OP_NCLASS:
334             #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
335             case OP_XCLASS:
336             /* The original code caused an unsigned overflow in 64 bit systems,
337             so now we use a conditional statement. */
338             if (op == OP_XCLASS)
339             cc += GET(cc, 1);
340             else
341             cc += PRIV(OP_lengths)[OP_CLASS];
342             #else
343 0           cc += PRIV(OP_lengths)[OP_CLASS];
344             #endif
345              
346 0           switch (*cc)
347             {
348             case OP_CRPLUS:
349             case OP_CRMINPLUS:
350             case OP_CRPOSPLUS:
351 0           branchlength++;
352             /* Fall through */
353              
354             case OP_CRSTAR:
355             case OP_CRMINSTAR:
356             case OP_CRQUERY:
357             case OP_CRMINQUERY:
358             case OP_CRPOSSTAR:
359             case OP_CRPOSQUERY:
360 0           cc++;
361 0           break;
362              
363             case OP_CRRANGE:
364             case OP_CRMINRANGE:
365             case OP_CRPOSRANGE:
366 0           branchlength += GET2(cc,1);
367 0           cc += 1 + 2 * IMM2_SIZE;
368 0           break;
369              
370             default:
371 0           branchlength++;
372 0           break;
373             }
374 0           break;
375              
376             /* Backreferences and subroutine calls are treated in the same way: we find
377             the minimum length for the subpattern. A recursion, however, causes
378             a flag to be set that causes the length of this branch to be ignored. The
379             logic is that a recursion can only make sense if there is another
380             alternation that stops the recursing. That will provide the minimum length
381             (when no recursion happens). A backreference within the group that it is
382             referencing behaves in the same way.
383              
384             If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
385             matches an empty string (by default it causes a matching failure), so in
386             that case we must set the minimum length to zero. */
387              
388             case OP_DNREF: /* Duplicate named pattern back reference */
389             case OP_DNREFI:
390 0 0         if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
391             {
392 0           int count = GET2(cc, 1+IMM2_SIZE);
393 0           pcre_uchar *slot = (pcre_uchar *)re +
394 0           re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
395 0           d = INT_MAX;
396 0 0         while (count-- > 0)
397             {
398 0           ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
399 0 0         if (cs == NULL) return -2;
400 0 0         do ce += GET(ce, 1); while (*ce == OP_ALT);
401 0 0         if (cc > cs && cc < ce) /* Simple recursion */
    0          
402             {
403 0           d = 0;
404 0           had_recurse = TRUE;
405 0           break;
406             }
407             else
408             {
409 0           recurse_check *r = recurses;
410 0 0         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    0          
411 0 0         if (r != NULL) /* Mutual recursion */
412             {
413 0           d = 0;
414 0           had_recurse = TRUE;
415 0           break;
416             }
417             else
418             {
419             int dd;
420 0           this_recurse.prev = recurses;
421 0           this_recurse.group = cs;
422 0           dd = find_minlength(re, cs, startcode, options, &this_recurse,
423             countptr);
424 0 0         if (dd < d) d = dd;
425             }
426             }
427 0           slot += re->name_entry_size;
428             }
429             }
430 0           else d = 0;
431 0           cc += 1 + 2*IMM2_SIZE;
432 0           goto REPEAT_BACK_REFERENCE;
433              
434             case OP_REF: /* Single back reference */
435             case OP_REFI:
436 0 0         if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
437             {
438 0           ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
439 0 0         if (cs == NULL) return -2;
440 0 0         do ce += GET(ce, 1); while (*ce == OP_ALT);
441 0 0         if (cc > cs && cc < ce) /* Simple recursion */
    0          
442             {
443 0           d = 0;
444 0           had_recurse = TRUE;
445             }
446             else
447             {
448 0           recurse_check *r = recurses;
449 0 0         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    0          
450 0 0         if (r != NULL) /* Mutual recursion */
451             {
452 0           d = 0;
453 0           had_recurse = TRUE;
454             }
455             else
456             {
457 0           this_recurse.prev = recurses;
458 0           this_recurse.group = cs;
459 0           d = find_minlength(re, cs, startcode, options, &this_recurse,
460             countptr);
461             }
462             }
463             }
464 0           else d = 0;
465 0           cc += 1 + IMM2_SIZE;
466              
467             /* Handle repeated back references */
468              
469             REPEAT_BACK_REFERENCE:
470 0           switch (*cc)
471             {
472             case OP_CRSTAR:
473             case OP_CRMINSTAR:
474             case OP_CRQUERY:
475             case OP_CRMINQUERY:
476             case OP_CRPOSSTAR:
477             case OP_CRPOSQUERY:
478 0           min = 0;
479 0           cc++;
480 0           break;
481              
482             case OP_CRPLUS:
483             case OP_CRMINPLUS:
484             case OP_CRPOSPLUS:
485 0           min = 1;
486 0           cc++;
487 0           break;
488              
489             case OP_CRRANGE:
490             case OP_CRMINRANGE:
491             case OP_CRPOSRANGE:
492 0           min = GET2(cc, 1);
493 0           cc += 1 + 2 * IMM2_SIZE;
494 0           break;
495              
496             default:
497 0           min = 1;
498 0           break;
499             }
500              
501 0           branchlength += min * d;
502 0           break;
503              
504             /* Direct recursion is detected from the position of the call within the
505             group; mutual recursion is detected by searching the recurse_check chain. */
506              
507             case OP_RECURSE:
508 0           cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
509 0 0         do ce += GET(ce, 1); while (*ce == OP_ALT);
510 0 0         if (cc > cs && cc < ce) /* Simple recursion */
    0          
511 0           had_recurse = TRUE;
512             else
513             {
514 0           recurse_check *r = recurses;
515 0 0         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    0          
516 0 0         if (r != NULL) /* Mutual recursion */
517 0           had_recurse = TRUE;
518             else
519             {
520 0           this_recurse.prev = recurses;
521 0           this_recurse.group = cs;
522 0           branchlength += find_minlength(re, cs, startcode, options,
523             &this_recurse, countptr);
524             }
525             }
526 0           cc += 1 + LINK_SIZE;
527 0           break;
528              
529             /* Anything else does not or need not match a character. We can get the
530             item's length from the table, but for those that can match zero occurrences
531             of a character, we must take special action for UTF-8 characters. As it
532             happens, the "NOT" versions of these opcodes are used at present only for
533             ASCII characters, so they could be omitted from this list. However, in
534             future that may change, so we include them here so as not to leave a
535             gotcha for a future maintainer. */
536              
537             case OP_UPTO:
538             case OP_UPTOI:
539             case OP_NOTUPTO:
540             case OP_NOTUPTOI:
541             case OP_MINUPTO:
542             case OP_MINUPTOI:
543             case OP_NOTMINUPTO:
544             case OP_NOTMINUPTOI:
545             case OP_POSUPTO:
546             case OP_POSUPTOI:
547             case OP_NOTPOSUPTO:
548             case OP_NOTPOSUPTOI:
549              
550             case OP_STAR:
551             case OP_STARI:
552             case OP_NOTSTAR:
553             case OP_NOTSTARI:
554             case OP_MINSTAR:
555             case OP_MINSTARI:
556             case OP_NOTMINSTAR:
557             case OP_NOTMINSTARI:
558             case OP_POSSTAR:
559             case OP_POSSTARI:
560             case OP_NOTPOSSTAR:
561             case OP_NOTPOSSTARI:
562              
563             case OP_QUERY:
564             case OP_QUERYI:
565             case OP_NOTQUERY:
566             case OP_NOTQUERYI:
567             case OP_MINQUERY:
568             case OP_MINQUERYI:
569             case OP_NOTMINQUERY:
570             case OP_NOTMINQUERYI:
571             case OP_POSQUERY:
572             case OP_POSQUERYI:
573             case OP_NOTPOSQUERY:
574             case OP_NOTPOSQUERYI:
575              
576 0           cc += PRIV(OP_lengths)[op];
577             #ifdef SUPPORT_UTF
578             if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
579             #endif
580 0           break;
581              
582             /* Skip these, but we need to add in the name length. */
583              
584             case OP_MARK:
585             case OP_PRUNE_ARG:
586             case OP_SKIP_ARG:
587             case OP_THEN_ARG:
588 0           cc += PRIV(OP_lengths)[op] + cc[1];
589 0           break;
590              
591             /* The remaining opcodes are just skipped over. */
592              
593             case OP_CLOSE:
594             case OP_COMMIT:
595             case OP_FAIL:
596             case OP_PRUNE:
597             case OP_SET_SOM:
598             case OP_SKIP:
599             case OP_THEN:
600 0           cc += PRIV(OP_lengths)[op];
601 0           break;
602              
603             /* This should not occur: we list all opcodes explicitly so that when
604             new ones get added they are properly considered. */
605              
606             default:
607 0           return -3;
608             }
609 0           }
610             /* Control never gets here */
611             }
612              
613              
614              
615             /*************************************************
616             * Set a bit and maybe its alternate case *
617             *************************************************/
618              
619             /* Given a character, set its first byte's bit in the table, and also the
620             corresponding bit for the other version of a letter if we are caseless. In
621             UTF-8 mode, for characters greater than 127, we can only do the caseless thing
622             when Unicode property support is available.
623              
624             Arguments:
625             start_bits points to the bit map
626             p points to the character
627             caseless the caseless flag
628             cd the block with char table pointers
629             utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
630              
631             Returns: pointer after the character
632             */
633              
634             static const pcre_uchar *
635 0           set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
636             compile_data *cd, BOOL utf)
637             {
638 0           pcre_uint32 c = *p;
639              
640             #ifdef COMPILE_PCRE8
641 0           SET_BIT(c);
642              
643             #ifdef SUPPORT_UTF
644             if (utf && c > 127)
645             {
646             GETCHARINC(c, p);
647             #ifdef SUPPORT_UCP
648             if (caseless)
649             {
650             pcre_uchar buff[6];
651             c = UCD_OTHERCASE(c);
652             (void)PRIV(ord2utf)(c, buff);
653             SET_BIT(buff[0]);
654             }
655             #endif /* SUPPORT_UCP */
656             return p;
657             }
658             #else /* Not SUPPORT_UTF */
659             (void)(utf); /* Stops warning for unused parameter */
660             #endif /* SUPPORT_UTF */
661              
662             /* Not UTF-8 mode, or character is less than 128. */
663              
664 0 0         if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
    0          
665 0           return p + 1;
666             #endif /* COMPILE_PCRE8 */
667              
668             #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
669             if (c > 0xff)
670             {
671             c = 0xff;
672             caseless = FALSE;
673             }
674             SET_BIT(c);
675              
676             #ifdef SUPPORT_UTF
677             if (utf && c > 127)
678             {
679             GETCHARINC(c, p);
680             #ifdef SUPPORT_UCP
681             if (caseless)
682             {
683             c = UCD_OTHERCASE(c);
684             if (c > 0xff)
685             c = 0xff;
686             SET_BIT(c);
687             }
688             #endif /* SUPPORT_UCP */
689             return p;
690             }
691             #else /* Not SUPPORT_UTF */
692             (void)(utf); /* Stops warning for unused parameter */
693             #endif /* SUPPORT_UTF */
694              
695             if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
696             return p + 1;
697             #endif
698             }
699              
700              
701              
702             /*************************************************
703             * Set bits for a positive character type *
704             *************************************************/
705              
706             /* This function sets starting bits for a character type. In UTF-8 mode, we can
707             only do a direct setting for bytes less than 128, as otherwise there can be
708             confusion with bytes in the middle of UTF-8 characters. In a "traditional"
709             environment, the tables will only recognize ASCII characters anyway, but in at
710             least one Windows environment, some higher bytes bits were set in the tables.
711             So we deal with that case by considering the UTF-8 encoding.
712              
713             Arguments:
714             start_bits the starting bitmap
715             cbit_type the type of character wanted
716             table_limit 32 for non-UTF-8; 16 for UTF-8
717             cd the block with char table pointers
718              
719             Returns: nothing
720             */
721              
722             static void
723 0           set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
724             compile_data *cd)
725             {
726             register pcre_uint32 c;
727 0 0         for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
728             #if defined SUPPORT_UTF && defined COMPILE_PCRE8
729             if (table_limit == 32) return;
730             for (c = 128; c < 256; c++)
731             {
732             if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
733             {
734             pcre_uchar buff[6];
735             (void)PRIV(ord2utf)(c, buff);
736             SET_BIT(buff[0]);
737             }
738             }
739             #endif
740 0           }
741              
742              
743             /*************************************************
744             * Set bits for a negative character type *
745             *************************************************/
746              
747             /* This function sets starting bits for a negative character type such as \D.
748             In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
749             otherwise there can be confusion with bytes in the middle of UTF-8 characters.
750             Unlike in the positive case, where we can set appropriate starting bits for
751             specific high-valued UTF-8 characters, in this case we have to set the bits for
752             all high-valued characters. The lowest is 0xc2, but we overkill by starting at
753             0xc0 (192) for simplicity.
754              
755             Arguments:
756             start_bits the starting bitmap
757             cbit_type the type of character wanted
758             table_limit 32 for non-UTF-8; 16 for UTF-8
759             cd the block with char table pointers
760              
761             Returns: nothing
762             */
763              
764             static void
765 0           set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
766             compile_data *cd)
767             {
768             register pcre_uint32 c;
769 0 0         for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
770             #if defined SUPPORT_UTF && defined COMPILE_PCRE8
771             if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
772             #endif
773 0           }
774              
775              
776              
777             /*************************************************
778             * Create bitmap of starting bytes *
779             *************************************************/
780              
781             /* This function scans a compiled unanchored expression recursively and
782             attempts to build a bitmap of the set of possible starting bytes. As time goes
783             by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
784             useful for parenthesized groups in patterns such as (a*)b where the group
785             provides some optional starting bytes but scanning must continue at the outer
786             level to find at least one mandatory byte. At the outermost level, this
787             function fails unless the result is SSB_DONE.
788              
789             Arguments:
790             code points to an expression
791             start_bits points to a 32-byte table, initialized to 0
792             utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
793             cd the block with char table pointers
794              
795             Returns: SSB_FAIL => Failed to find any starting bytes
796             SSB_DONE => Found mandatory starting bytes
797             SSB_CONTINUE => Found optional starting bytes
798             SSB_UNKNOWN => Hit an unrecognized opcode
799             */
800              
801             static int
802 0           set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
803             compile_data *cd)
804             {
805             register pcre_uint32 c;
806 0           int yield = SSB_DONE; /* becomes SSB_CONTINUE if any branch reaches OP_ALT without a mandatory item */
807             #if defined SUPPORT_UTF && defined COMPILE_PCRE8
808             int table_limit = utf? 16:32; /* in UTF-8 mode only bytes < 128 are set directly from the class tables */
809             #else
810 0           int table_limit = 32; /* whole 32-byte (256-bit) table is usable */
811             #endif
812              
813             #if 0
814             /* ========================================================================= */
815             /* The following comment and code was inserted in January 1999. In May 2006,
816             when it was observed to cause compiler warnings about unused values, I took it
817             out again. If anybody is still using OS/2, they will have to put it back
818             manually. */
819              
820             /* This next statement and the later reference to dummy are here in order to
821             trick the optimizer of the IBM C compiler for OS/2 into generating correct
822             code. Apparently IBM isn't going to fix the problem, and we would rather not
823             disable optimization (in this module it actually makes a big difference, and
824             the pcre module can use all the optimization it can get). */
825              
826             volatile int dummy;
827             /* ========================================================================= */
828             #endif
829              
830             do
831             {
832 0           BOOL try_next = TRUE;
833 0           const pcre_uchar *tcode = code + 1 + LINK_SIZE; /* step over the opcode at code and the LINK_SIZE field after it */
834              
835 0 0         if (*code == OP_CBRA || *code == OP_SCBRA ||
    0          
    0          
836 0 0         *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
837              
838 0 0         while (try_next) /* Loop for items in this branch */
839             {
840             int rc;
841              
842 0           switch(*tcode)
843             {
844             /* If we reach something we don't understand, it means a new opcode has
845             been created that hasn't been added to this code. Hopefully this problem
846             will be discovered during testing. */
847              
848             default:
849 0           return SSB_UNKNOWN;
850              
851             /* Fail for a valid opcode that implies no starting bits. */
852              
853             case OP_ACCEPT:
854             case OP_ASSERT_ACCEPT:
855             case OP_ALLANY:
856             case OP_ANY:
857             case OP_ANYBYTE:
858             case OP_CIRC:
859             case OP_CIRCM:
860             case OP_CLOSE:
861             case OP_COMMIT:
862             case OP_COND:
863             case OP_CREF:
864             case OP_DEF:
865             case OP_DNCREF:
866             case OP_DNREF:
867             case OP_DNREFI:
868             case OP_DNRREF:
869             case OP_DOLL:
870             case OP_DOLLM:
871             case OP_END:
872             case OP_EOD:
873             case OP_EODN:
874             case OP_EXTUNI:
875             case OP_FAIL:
876             case OP_MARK:
877             case OP_NOT:
878             case OP_NOTEXACT:
879             case OP_NOTEXACTI:
880             case OP_NOTI:
881             case OP_NOTMINPLUS:
882             case OP_NOTMINPLUSI:
883             case OP_NOTMINQUERY:
884             case OP_NOTMINQUERYI:
885             case OP_NOTMINSTAR:
886             case OP_NOTMINSTARI:
887             case OP_NOTMINUPTO:
888             case OP_NOTMINUPTOI:
889             case OP_NOTPLUS:
890             case OP_NOTPLUSI:
891             case OP_NOTPOSPLUS:
892             case OP_NOTPOSPLUSI:
893             case OP_NOTPOSQUERY:
894             case OP_NOTPOSQUERYI:
895             case OP_NOTPOSSTAR:
896             case OP_NOTPOSSTARI:
897             case OP_NOTPOSUPTO:
898             case OP_NOTPOSUPTOI:
899             case OP_NOTPROP:
900             case OP_NOTQUERY:
901             case OP_NOTQUERYI:
902             case OP_NOTSTAR:
903             case OP_NOTSTARI:
904             case OP_NOTUPTO:
905             case OP_NOTUPTOI:
906             case OP_NOT_HSPACE:
907             case OP_NOT_VSPACE:
908             case OP_PRUNE:
909             case OP_PRUNE_ARG:
910             case OP_RECURSE:
911             case OP_REF:
912             case OP_REFI:
913             case OP_REVERSE:
914             case OP_RREF:
915             case OP_SCOND:
916             case OP_SET_SOM:
917             case OP_SKIP:
918             case OP_SKIP_ARG:
919             case OP_SOD:
920             case OP_SOM:
921             case OP_THEN:
922             case OP_THEN_ARG:
923 0           return SSB_FAIL;
924              
925             /* A "real" property test implies no starting bits, but the fake property
926             PT_CLIST identifies a list of characters. These lists are short, as they
927             are used for characters with more than one "other case", so there is no
928             point in recognizing them for OP_NOTPROP. */
929              
930             case OP_PROP:
931 0 0         if (tcode[1] != PT_CLIST) return SSB_FAIL;
932             {
933 0           const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
934 0 0         while ((c = *p++) < NOTACHAR)
935             {
936             #if defined SUPPORT_UTF && defined COMPILE_PCRE8
937             if (utf)
938             {
939             pcre_uchar buff[6];
940             (void)PRIV(ord2utf)(c, buff);
941             c = buff[0];
942             }
943             #endif
944 0 0         if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
945             }
946             }
947 0           try_next = FALSE;
948 0           break;
949              
950             /* We can ignore word boundary tests. */
951              
952             case OP_WORD_BOUNDARY:
953             case OP_NOT_WORD_BOUNDARY:
954 0           tcode++;
955 0           break;
956              
957             /* If we hit a bracket or a positive lookahead assertion, recurse to set
958             bits from within the subpattern. If it can't find anything, we have to
959             give up. If it finds some mandatory character(s), we are done for this
960             branch. Otherwise, carry on scanning after the subpattern. */
961              
962             case OP_BRA:
963             case OP_SBRA:
964             case OP_CBRA:
965             case OP_SCBRA:
966             case OP_BRAPOS:
967             case OP_SBRAPOS:
968             case OP_CBRAPOS:
969             case OP_SCBRAPOS:
970             case OP_ONCE:
971             case OP_ONCE_NC:
972             case OP_ASSERT:
973 0           rc = set_start_bits(tcode, start_bits, utf, cd);
974 0 0         if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
    0          
975 0 0         if (rc == SSB_DONE) try_next = FALSE; else
976             {
977 0 0         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
978 0           tcode += 1 + LINK_SIZE;
979             }
980 0           break;
981              
982             /* If we hit ALT or KET, it means we haven't found anything mandatory in
983             this branch, though we might have found something optional. For ALT, we
984             continue with the next alternative, but we have to arrange that the final
985             result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
986             return SSB_CONTINUE: if this is the top level, that indicates failure,
987             but after a nested subpattern, it causes scanning to continue. */
988              
989             case OP_ALT:
990 0           yield = SSB_CONTINUE;
991 0           try_next = FALSE;
992 0           break;
993              
994             case OP_KET:
995             case OP_KETRMAX:
996             case OP_KETRMIN:
997             case OP_KETRPOS:
998 0           return SSB_CONTINUE;
999              
1000             /* Skip over callout */
1001              
1002             case OP_CALLOUT:
1003 0           tcode += 2 + 2*LINK_SIZE;
1004 0           break;
1005              
1006             /* Skip over lookbehind and negative lookahead assertions */
1007              
1008             case OP_ASSERT_NOT:
1009             case OP_ASSERTBACK:
1010             case OP_ASSERTBACK_NOT:
1011 0 0         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1012 0           tcode += 1 + LINK_SIZE;
1013 0           break;
1014              
1015             /* BRAZERO does the bracket, but carries on. */
1016              
1017             case OP_BRAZERO:
1018             case OP_BRAMINZERO:
1019             case OP_BRAPOSZERO:
1020 0           rc = set_start_bits(++tcode, start_bits, utf, cd);
1021 0 0         if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
    0          
1022             /* =========================================================================
1023             See the comment at the head of this function concerning the next line,
1024             which was an old fudge for the benefit of OS/2.
1025             dummy = 1;
1026             ========================================================================= */
1027 0 0         do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1028 0           tcode += 1 + LINK_SIZE;
1029 0           break;
1030              
1031             /* SKIPZERO skips the bracket. */
1032              
1033             case OP_SKIPZERO:
1034 0           tcode++;
1035 0 0         do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1036 0           tcode += 1 + LINK_SIZE;
1037 0           break;
1038              
1039             /* Single-char * or ? sets the bit and tries the next item */
1040              
1041             case OP_STAR:
1042             case OP_MINSTAR:
1043             case OP_POSSTAR:
1044             case OP_QUERY:
1045             case OP_MINQUERY:
1046             case OP_POSQUERY:
1047 0           tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1048 0           break;
1049              
1050             case OP_STARI:
1051             case OP_MINSTARI:
1052             case OP_POSSTARI:
1053             case OP_QUERYI:
1054             case OP_MINQUERYI:
1055             case OP_POSQUERYI:
1056 0           tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1057 0           break;
1058              
1059             /* Single-char upto sets the bit and tries the next */
1060              
1061             case OP_UPTO:
1062             case OP_MINUPTO:
1063             case OP_POSUPTO:
1064 0           tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1065 0           break;
1066              
1067             case OP_UPTOI:
1068             case OP_MINUPTOI:
1069             case OP_POSUPTOI:
1070 0           tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1071 0           break;
1072              
1073             /* At least one single char sets the bit and stops */
1074              
1075             case OP_EXACT:
1076 0           tcode += IMM2_SIZE;
1077             /* Fall through */
1078             case OP_CHAR:
1079             case OP_PLUS:
1080             case OP_MINPLUS:
1081             case OP_POSPLUS:
1082 0           (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1083 0           try_next = FALSE;
1084 0           break;
1085              
1086             case OP_EXACTI:
1087 0           tcode += IMM2_SIZE;
1088             /* Fall through */
1089             case OP_CHARI:
1090             case OP_PLUSI:
1091             case OP_MINPLUSI:
1092             case OP_POSPLUSI:
1093 0           (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1094 0           try_next = FALSE;
1095 0           break;
1096              
1097             /* Special spacing and line-terminating items. These recognize specific
1098             lists of characters. The difference between VSPACE and ANYNL is that the
1099             latter can match the two-character CRLF sequence, but that is not
1100             relevant for finding the first character, so their code here is
1101             identical. */
1102              
1103             case OP_HSPACE:
1104 0           SET_BIT(CHAR_HT);
1105 0           SET_BIT(CHAR_SPACE);
1106             #ifdef SUPPORT_UTF
1107             if (utf)
1108             {
1109             #ifdef COMPILE_PCRE8
1110             SET_BIT(0xC2); /* For U+00A0 */
1111             SET_BIT(0xE1); /* For U+1680, U+180E */
1112             SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1113             SET_BIT(0xE3); /* For U+3000 */
1114             #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1115             SET_BIT(0xA0);
1116             SET_BIT(0xFF); /* For characters > 255 */
1117             #endif /* COMPILE_PCRE[8|16|32] */
1118             }
1119             else
1120             #endif /* SUPPORT_UTF */
1121             {
1122             #ifndef EBCDIC
1123 0           SET_BIT(0xA0);
1124             #endif /* Not EBCDIC */
1125             #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1126             SET_BIT(0xFF); /* For characters > 255 */
1127             #endif /* COMPILE_PCRE[16|32] */
1128             }
1129 0           try_next = FALSE;
1130 0           break;
1131              
1132             case OP_ANYNL:
1133             case OP_VSPACE:
1134 0           SET_BIT(CHAR_LF);
1135 0           SET_BIT(CHAR_VT);
1136 0           SET_BIT(CHAR_FF);
1137 0           SET_BIT(CHAR_CR);
1138             #ifdef SUPPORT_UTF
1139             if (utf)
1140             {
1141             #ifdef COMPILE_PCRE8
1142             SET_BIT(0xC2); /* For U+0085 */
1143             SET_BIT(0xE2); /* For U+2028, U+2029 */
1144             #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1145             SET_BIT(CHAR_NEL);
1146             SET_BIT(0xFF); /* For characters > 255 */
1147             #endif /* COMPILE_PCRE[8|16|32] */
1148             }
1149             else
1150             #endif /* SUPPORT_UTF */
1151             {
1152 0           SET_BIT(CHAR_NEL);
1153             #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1154             SET_BIT(0xFF); /* For characters > 255 */
1155             #endif
1156             }
1157 0           try_next = FALSE;
1158 0           break;
1159              
1160             /* Single character types set the bits and stop. Note that if PCRE_UCP
1161             is set, we do not see these op codes because \d etc are converted to
1162             properties. Therefore, these apply in the case when only characters less
1163             than 256 are recognized to match the types. */
1164              
1165             case OP_NOT_DIGIT:
1166 0           set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1167 0           try_next = FALSE;
1168 0           break;
1169              
1170             case OP_DIGIT:
1171 0           set_type_bits(start_bits, cbit_digit, table_limit, cd);
1172 0           try_next = FALSE;
1173 0           break;
1174              
1175             /* The cbit_space table has vertical tab as whitespace; we no longer
1176             have to play fancy tricks because Perl added VT to its whitespace at
1177             release 5.18. PCRE added it at release 8.34. */
1178              
1179             case OP_NOT_WHITESPACE:
1180 0           set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1181 0           try_next = FALSE;
1182 0           break;
1183              
1184             case OP_WHITESPACE:
1185 0           set_type_bits(start_bits, cbit_space, table_limit, cd);
1186 0           try_next = FALSE;
1187 0           break;
1188              
1189             case OP_NOT_WORDCHAR:
1190 0           set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1191 0           try_next = FALSE;
1192 0           break;
1193              
1194             case OP_WORDCHAR:
1195 0           set_type_bits(start_bits, cbit_word, table_limit, cd);
1196 0           try_next = FALSE;
1197 0           break;
1198              
1199             /* One or more character type fudges the pointer and restarts, knowing
1200             it will hit a single character type and stop there. */
1201              
1202             case OP_TYPEPLUS:
1203             case OP_TYPEMINPLUS:
1204             case OP_TYPEPOSPLUS:
1205 0           tcode++;
1206 0           break;
1207              
1208             case OP_TYPEEXACT:
1209 0           tcode += 1 + IMM2_SIZE;
1210 0           break;
1211              
1212             /* Zero or more repeats of character types set the bits and then
1213             try again. */
1214              
1215             case OP_TYPEUPTO:
1216             case OP_TYPEMINUPTO:
1217             case OP_TYPEPOSUPTO:
1218 0           tcode += IMM2_SIZE; /* Fall through */
1219              
1220             case OP_TYPESTAR:
1221             case OP_TYPEMINSTAR:
1222             case OP_TYPEPOSSTAR:
1223             case OP_TYPEQUERY:
1224             case OP_TYPEMINQUERY:
1225             case OP_TYPEPOSQUERY:
1226 0           switch(tcode[1])
1227             {
1228             default:
1229             case OP_ANY:
1230             case OP_ALLANY:
1231 0           return SSB_FAIL;
1232              
1233             case OP_HSPACE:
1234 0           SET_BIT(CHAR_HT);
1235 0           SET_BIT(CHAR_SPACE);
1236             #ifdef SUPPORT_UTF
1237             if (utf)
1238             {
1239             #ifdef COMPILE_PCRE8
1240             SET_BIT(0xC2); /* For U+00A0 */
1241             SET_BIT(0xE1); /* For U+1680, U+180E */
1242             SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1243             SET_BIT(0xE3); /* For U+3000 */
1244             #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1245             SET_BIT(0xA0);
1246             SET_BIT(0xFF); /* For characters > 255 */
1247             #endif /* COMPILE_PCRE[8|16|32] */
1248             }
1249             else
1250             #endif /* SUPPORT_UTF */
1251             #ifndef EBCDIC
1252 0           SET_BIT(0xA0); /* NOTE(review): the stand-alone OP_HSPACE case also does SET_BIT(0xFF) for non-UTF 16/32-bit builds; this path does not - confirm intended */
1253             #endif /* Not EBCDIC */
1254 0           break;
1255              
1256             case OP_ANYNL:
1257             case OP_VSPACE:
1258 0           SET_BIT(CHAR_LF);
1259 0           SET_BIT(CHAR_VT);
1260 0           SET_BIT(CHAR_FF);
1261 0           SET_BIT(CHAR_CR);
1262             #ifdef SUPPORT_UTF
1263             if (utf)
1264             {
1265             #ifdef COMPILE_PCRE8
1266             SET_BIT(0xC2); /* For U+0085 */
1267             SET_BIT(0xE2); /* For U+2028, U+2029 */
1268             #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1269             SET_BIT(CHAR_NEL);
1270             SET_BIT(0xFF); /* For characters > 255 */
1271             #endif /* COMPILE_PCRE[8|16|32] */
1272             }
1273             else
1274             #endif /* SUPPORT_UTF */
1275 0           SET_BIT(CHAR_NEL); /* NOTE(review): the stand-alone OP_ANYNL/OP_VSPACE case also does SET_BIT(0xFF) for non-UTF 16/32-bit builds; this path does not - confirm intended */
1276 0           break;
1277              
1278             case OP_NOT_DIGIT:
1279 0           set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1280 0           break;
1281              
1282             case OP_DIGIT:
1283 0           set_type_bits(start_bits, cbit_digit, table_limit, cd);
1284 0           break;
1285              
1286             /* The cbit_space table has vertical tab as whitespace; we no longer
1287             have to play fancy tricks because Perl added VT to its whitespace at
1288             release 5.18. PCRE added it at release 8.34. */
1289              
1290             case OP_NOT_WHITESPACE:
1291 0           set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1292 0           break;
1293              
1294             case OP_WHITESPACE:
1295 0           set_type_bits(start_bits, cbit_space, table_limit, cd);
1296 0           break;
1297              
1298             case OP_NOT_WORDCHAR:
1299 0           set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1300 0           break;
1301              
1302             case OP_WORDCHAR:
1303 0           set_type_bits(start_bits, cbit_word, table_limit, cd);
1304 0           break;
1305             }
1306              
1307 0           tcode += 2;
1308 0           break;
1309              
1310             /* Character class where all the information is in a bit map: set the
1311             bits and either carry on or not, according to the repeat count. If it was
1312             a negative class, and we are operating with UTF-8 characters, any byte
1313             with a value >= 0xc4 is a potentially valid starter because it starts a
1314             character with a value > 255. */
1315              
1316             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1317             case OP_XCLASS:
1318             if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1319             return SSB_FAIL;
1320             /* All bits are set. */
1321             if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1322             return SSB_FAIL;
1323             #endif
1324             /* Fall through */
1325              
1326             case OP_NCLASS:
1327             #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1328             if (utf)
1329             {
1330             start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc7 */
1331             memset(start_bits+25, 0xff, 7); /* Bits for 0xc8 - 0xff */
1332             }
1333             #endif
1334             #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1335             SET_BIT(0xFF); /* For characters > 255 */
1336             #endif
1337             /* Fall through */
1338              
1339             case OP_CLASS:
1340             {
1341             pcre_uint8 *map;
1342             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1343             map = NULL;
1344             if (*tcode == OP_XCLASS)
1345             {
1346             if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1347             map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1348             tcode += GET(tcode, 1);
1349             }
1350             else
1351             #endif
1352             {
1353 0           tcode++;
1354 0           map = (pcre_uint8 *)tcode;
1355 0           tcode += 32 / sizeof(pcre_uchar);
1356             }
1357              
1358             /* In UTF-8 mode, the bits in a bit map correspond to character
1359             values, not to byte values. However, the bit map we are constructing is
1360             for byte values. So we have to do a conversion for characters whose
1361             value is > 127. In fact, there are only two possible starting bytes for
1362             characters in the range 128 - 255. */
1363              
1364             #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1365             if (map != NULL)
1366             #endif
1367             {
1368             #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1369             if (utf)
1370             {
1371             for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1372             for (c = 128; c < 256; c++)
1373             {
1374             if ((map[c/8] & (1 << (c&7))) != 0)
1375             {
1376             int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1377             start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1378             c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1379             }
1380             }
1381             }
1382             else
1383             #endif
1384             {
1385             /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1386 0 0         for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1387             }
1388             }
1389              
1390             /* Advance past the bit map, and act on what follows. For a zero
1391             minimum repeat, continue; otherwise stop processing. */
1392              
1393 0           switch (*tcode)
1394             {
1395             case OP_CRSTAR:
1396             case OP_CRMINSTAR:
1397             case OP_CRQUERY:
1398             case OP_CRMINQUERY:
1399             case OP_CRPOSSTAR:
1400             case OP_CRPOSQUERY:
1401 0           tcode++;
1402 0           break;
1403              
1404             case OP_CRRANGE:
1405             case OP_CRMINRANGE:
1406             case OP_CRPOSRANGE:
1407 0 0         if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1408 0           else try_next = FALSE;
1409 0           break;
1410              
1411             default:
1412 0           try_next = FALSE;
1413 0           break;
1414             }
1415             }
1416 0           break; /* End of bitmap class handling */
1417              
1418             } /* End of switch */
1419             } /* End of try_next loop */
1420              
1421 0           code += GET(code, 1); /* Advance to next branch */
1422             }
1423 0 0         while (*code == OP_ALT);
1424 0           return yield;
1425             }
1426              
1427              
1428              
1429              
1430              
1431             /*************************************************
1432             * Study a compiled expression *
1433             *************************************************/
1434              
1435             /* This function is handed a compiled expression that it must study to produce
1436             information that will speed up the matching. It returns a pcre[16|32]_extra block
1437             which then gets handed back to pcre_exec().
1438              
1439             Arguments:
1440             re points to the compiled expression
1441             options contains option bits
1442             errorptr points to where to place error messages;
1443             set NULL unless error
1444              
1445             Returns: pointer to a pcre[16|32]_extra block, with study_data filled in and
1446             the appropriate flags set;
1447             NULL on error or if no optimization possible
1448             */
1449              
1450             #if defined COMPILE_PCRE8
1451             PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1452 0           pcre_study(const pcre *external_re, int options, const char **errorptr)
1453             #elif defined COMPILE_PCRE16
1454             PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1455             pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1456             #elif defined COMPILE_PCRE32
1457             PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1458             pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1459             #endif
1460             {
1461             int min;
1462 0           int count = 0;
1463 0           BOOL bits_set = FALSE;
1464             pcre_uint8 start_bits[32];
1465 0           PUBL(extra) *extra = NULL;
1466             pcre_study_data *study;
1467             const pcre_uint8 *tables;
1468             pcre_uchar *code;
1469             compile_data compile_block;
1470 0           const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1471              
1472              
1473 0           *errorptr = NULL;
1474              
1475 0 0         if (re == NULL || re->magic_number != MAGIC_NUMBER)
    0          
1476             {
1477 0           *errorptr = "argument is not a compiled regular expression";
1478 0           return NULL;
1479             }
1480              
1481 0 0         if ((re->flags & PCRE_MODE) == 0)
1482             {
1483             #if defined COMPILE_PCRE8
1484 0           *errorptr = "argument not compiled in 8 bit mode";
1485             #elif defined COMPILE_PCRE16
1486             *errorptr = "argument not compiled in 16 bit mode";
1487             #elif defined COMPILE_PCRE32
1488             *errorptr = "argument not compiled in 32 bit mode";
1489             #endif
1490 0           return NULL;
1491             }
1492              
1493 0 0         if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1494             {
1495 0           *errorptr = "unknown or incorrect option bit(s) set";
1496 0           return NULL;
1497             }
1498              
1499 0           code = (pcre_uchar *)re + re->name_table_offset +
1500 0           (re->name_count * re->name_entry_size);
1501              
1502             /* For an anchored pattern, or an unanchored pattern that has a first char, or
1503             a multiline pattern that matches only at "line starts", there is no point in
1504             seeking a list of starting bytes. */
1505              
1506 0 0         if ((re->options & PCRE_ANCHORED) == 0 &&
    0          
1507 0           (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1508             {
1509             int rc;
1510              
1511             /* Set the character tables in the block that is passed around */
1512              
1513 0           tables = re->tables;
1514              
1515             #if defined COMPILE_PCRE8
1516 0 0         if (tables == NULL)
1517 0           (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1518             (void *)(&tables));
1519             #elif defined COMPILE_PCRE16
1520             if (tables == NULL)
1521             (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1522             (void *)(&tables));
1523             #elif defined COMPILE_PCRE32
1524             if (tables == NULL)
1525             (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1526             (void *)(&tables));
1527             #endif
1528              
1529 0           compile_block.lcc = tables + lcc_offset;
1530 0           compile_block.fcc = tables + fcc_offset;
1531 0           compile_block.cbits = tables + cbits_offset;
1532 0           compile_block.ctypes = tables + ctypes_offset;
1533              
1534             /* See if we can find a fixed set of initial characters for the pattern. */
1535              
1536 0           memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1537 0           rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1538             &compile_block);
1539 0           bits_set = rc == SSB_DONE;
1540 0 0         if (rc == SSB_UNKNOWN)
1541             {
1542 0           *errorptr = "internal error: opcode not recognized";
1543 0           return NULL;
1544             }
1545             }
1546              
1547             /* Find the minimum length of subject string. */
1548              
1549 0           switch(min = find_minlength(re, code, code, re->options, NULL, &count))
1550             {
1551 0           case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1552 0           case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1553 0           default: break;
1554             }
1555              
1556             /* If a set of starting bytes has been identified, or if the minimum length is
1557             greater than zero, or if JIT optimization has been requested, or if
1558             PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1559             pcre_study_data block. The study data is put in the latter, which is pointed to
1560             by the former, which may also get additional data set later by the calling
1561             program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1562             save it in a field for returning via the pcre_fullinfo() function so that if it
1563             becomes variable in the future, we don't have to change that code. */
1564              
1565 0 0         if (bits_set || min > 0 || (options & (
    0          
    0          
1566             #ifdef SUPPORT_JIT
1567             PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1568             PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1569             #endif
1570             PCRE_STUDY_EXTRA_NEEDED)) != 0)
1571             {
1572 0           extra = (PUBL(extra) *)(PUBL(malloc))
1573             (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1574 0 0         if (extra == NULL)
1575             {
1576 0           *errorptr = "failed to get memory";
1577 0           return NULL;
1578             }
1579              
1580 0           study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1581 0           extra->flags = PCRE_EXTRA_STUDY_DATA;
1582 0           extra->study_data = study;
1583              
1584 0           study->size = sizeof(pcre_study_data);
1585 0           study->flags = 0;
1586              
1587             /* Set the start bits always, to avoid unset memory errors if the
1588             study data is written to a file, but set the flag only if any of the bits
1589             are set, to save time looking when none are. */
1590              
1591 0 0         if (bits_set)
1592             {
1593 0           study->flags |= PCRE_STUDY_MAPPED;
1594 0           memcpy(study->start_bits, start_bits, sizeof(start_bits));
1595             }
1596 0           else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1597              
1598             #ifdef PCRE_DEBUG
1599             if (bits_set)
1600             {
1601             pcre_uint8 *ptr = start_bits;
1602             int i;
1603              
1604             printf("Start bits:\n");
1605             for (i = 0; i < 32; i++)
1606             printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1607             }
1608             #endif
1609              
1610             /* Always set the minlength value in the block, because the JIT compiler
1611             makes use of it. However, don't set the bit unless the length is greater than
1612             zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1613             checking the zero case. */
1614              
1615 0 0         if (min > 0)
1616             {
1617 0           study->flags |= PCRE_STUDY_MINLEN;
1618 0           study->minlength = min;
1619             }
1620 0           else study->minlength = 0;
1621              
1622             /* If JIT support was compiled and requested, attempt the JIT compilation.
1623             If no starting bytes were found, and the minimum length is zero, and JIT
1624             compilation fails, abandon the extra block and return NULL, unless
1625             PCRE_STUDY_EXTRA_NEEDED is set. */
1626              
1627             #ifdef SUPPORT_JIT
1628             extra->executable_jit = NULL;
1629             if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1630             PRIV(jit_compile)(re, extra, JIT_COMPILE);
1631             if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1632             PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1633             if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1634             PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1635              
1636             if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1637             (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1638             {
1639             #if defined COMPILE_PCRE8
1640             pcre_free_study(extra);
1641             #elif defined COMPILE_PCRE16
1642             pcre16_free_study(extra);
1643             #elif defined COMPILE_PCRE32
1644             pcre32_free_study(extra);
1645             #endif
1646             extra = NULL;
1647             }
1648             #endif
1649             }
1650              
1651 0           return extra;
1652             }
1653              
1654              
1655             /*************************************************
1656             * Free the study data *
1657             *************************************************/
1658              
1659             /* This function frees the memory that was obtained by pcre_study().
1660              
1661             Argument: a pointer to the pcre[16|32]_extra block; NULL is accepted and ignored
1662             Returns: nothing
1663             */
1664              
1665             #if defined COMPILE_PCRE8
1666             PCRE_EXP_DEFN void
1667 0           pcre_free_study(pcre_extra *extra)
1668             #elif defined COMPILE_PCRE16
1669             PCRE_EXP_DEFN void
1670             pcre16_free_study(pcre16_extra *extra)
1671             #elif defined COMPILE_PCRE32
1672             PCRE_EXP_DEFN void
1673             pcre32_free_study(pcre32_extra *extra)
1674             #endif
1675             {
                 /* Release an extra block. The same body is built under the name
                 pcre_free_study, pcre16_free_study or pcre32_free_study, according to
                 which of COMPILE_PCRE8, COMPILE_PCRE16 or COMPILE_PCRE32 is defined.

                 A NULL argument is accepted: there is nothing to release, so return
                 at once. */
1676 0 0         if (extra == NULL)
1677 0           return;
                 /* Only when JIT support is compiled in: if the block's flags have
                 PCRE_EXTRA_EXECUTABLE_JIT set and executable_jit is non-NULL, pass that
                 pointer to PRIV(jit_free) before the block itself goes away. Both
                 conditions are checked, so a set flag with a NULL pointer is skipped. */
1678             #ifdef SUPPORT_JIT
1679             if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1680             extra->executable_jit != NULL)
1681             PRIV(jit_free)(extra->executable_jit);
1682             #endif
                 /* pcre_study() obtains the extra block and its pcre_study_data with a
                 single PUBL(malloc) call (the study data sits directly after the extra
                 block), so this one PUBL(free) call releases both. */
1683 0           PUBL(free)(extra);
1684             }
1685              
1686             /* End of pcre_study.c */