File Coverage

regexec.c
Criterion Covered Total %
statement 548 1709 32.0
branch 334 1604 20.8
condition n/a
subroutine n/a
pod n/a
total 882 3313 26.6


line stmt bran cond sub pod time code
1             /* Extended regular expression matching and search library.
2             Copyright (C) 2002-2014 Free Software Foundation, Inc.
3             This file is part of the GNU C Library.
4             Contributed by Isamu Hasegawa .
5              
6             The GNU C Library is free software; you can redistribute it and/or
7             modify it under the terms of the GNU Lesser General Public
8             License as published by the Free Software Foundation; either
9             version 2.1 of the License, or (at your option) any later version.
10              
11             The GNU C Library is distributed in the hope that it will be useful,
12             but WITHOUT ANY WARRANTY; without even the implied warranty of
13             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14             Lesser General Public License for more details.
15              
16             You should have received a copy of the GNU Lesser General Public
17             License along with the GNU C Library; if not, see
18             . */
19              
20             static reg_errcode_t match_ctx_init (pTHX_ re_match_context_t *cache, int eflags,
21             Idx n) internal_function;
22             static void match_ctx_clean (pTHX_ re_match_context_t *mctx) internal_function;
23             static void match_ctx_free (pTHX_ re_match_context_t *cache) internal_function;
24             static reg_errcode_t match_ctx_add_entry (pTHX_ re_match_context_t *cache, Idx node,
25             Idx str_idx, Idx from, Idx to)
26             internal_function;
27             static Idx search_cur_bkref_entry (pTHX_ const re_match_context_t *mctx, Idx str_idx)
28             internal_function;
29             static reg_errcode_t match_ctx_add_subtop (pTHX_ re_match_context_t *mctx, Idx node,
30             Idx str_idx) internal_function;
31             static re_sub_match_last_t * match_ctx_add_sublast (pTHX_ re_sub_match_top_t *subtop,
32             Idx node, Idx str_idx)
33             internal_function;
34             static void sift_ctx_init (pTHX_ re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35             re_dfastate_t **limited_sts, Idx last_node,
36             Idx last_str_idx)
37             internal_function;
38             static reg_errcode_t re_search_internal (pTHX_ const regex_t *preg,
39             const char *string, Idx length, SV *sv,
40             Idx start, Idx last_start, Idx stop,
41             size_t nmatch, regmatch_t pmatch[],
42             int eflags) internal_function;
43             static regoff_t re_search_2_stub (pTHX_ struct re_pattern_buffer *bufp,
44             const char *string1, Idx length1, SV *sv1,
45             const char *string2, Idx length2, SV *sv2,
46             Idx start, regoff_t range,
47             struct re_registers *regs,
48             Idx stop, bool ret_len) internal_function;
49             static regoff_t re_search_stub (pTHX_ struct re_pattern_buffer *bufp,
50             const char *string, Idx length, SV *sv, Idx start,
51             regoff_t range, Idx stop,
52             struct re_registers *regs,
53             bool ret_len) internal_function;
54             static unsigned re_copy_regs (pTHX_ struct re_registers *regs, regmatch_t *pmatch,
55             Idx nregs, int regs_allocated) internal_function;
56             static reg_errcode_t prune_impossible_nodes (pTHX_ re_match_context_t *mctx)
57             internal_function;
58             static Idx check_matching (pTHX_ re_match_context_t *mctx, bool fl_longest_match,
59             Idx *p_match_first) internal_function;
60             static Idx check_halt_state_context (pTHX_ const re_match_context_t *mctx,
61             const re_dfastate_t *state, Idx idx)
62             internal_function;
63             static void update_regs (pTHX_ const re_dfa_t *dfa, regmatch_t *pmatch,
64             regmatch_t *prev_idx_match, Idx cur_node,
65             Idx cur_idx, Idx nmatch) internal_function;
66             static reg_errcode_t push_fail_stack (pTHX_ struct re_fail_stack_t *fs,
67             Idx str_idx, Idx dest_node, Idx nregs,
68             regmatch_t *regs,
69             re_node_set *eps_via_nodes)
70             internal_function;
71             static reg_errcode_t set_regs (pTHX_ const regex_t *preg,
72             const re_match_context_t *mctx,
73             size_t nmatch, regmatch_t *pmatch,
74             bool fl_backtrack) internal_function;
75             static reg_errcode_t free_fail_stack_return (pTHX_ struct re_fail_stack_t *fs)
76             internal_function;
77              
78             #ifdef RE_ENABLE_I18N
79             static int sift_states_iter_mb (pTHX_ const re_match_context_t *mctx,
80             re_sift_context_t *sctx,
81             Idx node_idx, Idx str_idx, Idx max_str_idx)
82             internal_function;
83             #endif /* RE_ENABLE_I18N */
84             static reg_errcode_t sift_states_backward (pTHX_ const re_match_context_t *mctx,
85             re_sift_context_t *sctx)
86             internal_function;
87             static reg_errcode_t build_sifted_states (pTHX_ const re_match_context_t *mctx,
88             re_sift_context_t *sctx, Idx str_idx,
89             re_node_set *cur_dest)
90             internal_function;
91             static reg_errcode_t update_cur_sifted_state (pTHX_ const re_match_context_t *mctx,
92             re_sift_context_t *sctx,
93             Idx str_idx,
94             re_node_set *dest_nodes)
95             internal_function;
96             static reg_errcode_t add_epsilon_src_nodes (pTHX_ const re_dfa_t *dfa,
97             re_node_set *dest_nodes,
98             const re_node_set *candidates)
99             internal_function;
100             static bool check_dst_limits (pTHX_ const re_match_context_t *mctx,
101             const re_node_set *limits,
102             Idx dst_node, Idx dst_idx, Idx src_node,
103             Idx src_idx) internal_function;
104             static int check_dst_limits_calc_pos_1 (pTHX_ const re_match_context_t *mctx,
105             int boundaries, Idx subexp_idx,
106             Idx from_node, Idx bkref_idx)
107             internal_function;
108             static int check_dst_limits_calc_pos (pTHX_ const re_match_context_t *mctx,
109             Idx limit, Idx subexp_idx,
110             Idx node, Idx str_idx,
111             Idx bkref_idx) internal_function;
112             static reg_errcode_t check_subexp_limits (pTHX_ const re_dfa_t *dfa,
113             re_node_set *dest_nodes,
114             const re_node_set *candidates,
115             re_node_set *limits,
116             struct re_backref_cache_entry *bkref_ents,
117             Idx str_idx) internal_function;
118             static reg_errcode_t sift_states_bkref (pTHX_ const re_match_context_t *mctx,
119             re_sift_context_t *sctx,
120             Idx str_idx, const re_node_set *candidates)
121             internal_function;
122             static reg_errcode_t merge_state_array (pTHX_ const re_dfa_t *dfa,
123             re_dfastate_t **dst,
124             re_dfastate_t **src, Idx num)
125             internal_function;
126             static re_dfastate_t *find_recover_state (pTHX_ reg_errcode_t *err,
127             re_match_context_t *mctx) internal_function;
128             static re_dfastate_t *transit_state (pTHX_ reg_errcode_t *err,
129             re_match_context_t *mctx,
130             re_dfastate_t *state) internal_function;
131             static re_dfastate_t *merge_state_with_log (pTHX_ reg_errcode_t *err,
132             re_match_context_t *mctx,
133             re_dfastate_t *next_state)
134             internal_function;
135             static reg_errcode_t check_subexp_matching_top (pTHX_ re_match_context_t *mctx,
136             re_node_set *cur_nodes,
137             Idx str_idx) internal_function;
138             #if 0
139             static re_dfastate_t *transit_state_sb (pTHX_ reg_errcode_t *err,
140             re_match_context_t *mctx,
141             re_dfastate_t *pstate)
142             internal_function;
143             #endif
144             #ifdef RE_ENABLE_I18N
145             static reg_errcode_t transit_state_mb (pTHX_ re_match_context_t *mctx,
146             re_dfastate_t *pstate)
147             internal_function;
148             #endif /* RE_ENABLE_I18N */
149             static reg_errcode_t transit_state_bkref (pTHX_ re_match_context_t *mctx,
150             const re_node_set *nodes)
151             internal_function;
152             static reg_errcode_t get_subexp (pTHX_ re_match_context_t *mctx,
153             Idx bkref_node, Idx bkref_str_idx)
154             internal_function;
155             static reg_errcode_t get_subexp_sub (pTHX_ re_match_context_t *mctx,
156             const re_sub_match_top_t *sub_top,
157             re_sub_match_last_t *sub_last,
158             Idx bkref_node, Idx bkref_str)
159             internal_function;
160             static Idx find_subexp_node (pTHX_ const re_dfa_t *dfa, const re_node_set *nodes,
161             Idx subexp_idx, int type) internal_function;
162             static reg_errcode_t check_arrival (pTHX_ re_match_context_t *mctx,
163             state_array_t *path, Idx top_node,
164             Idx top_str, Idx last_node, Idx last_str,
165             int type) internal_function;
166             static reg_errcode_t check_arrival_add_next_nodes (pTHX_ re_match_context_t *mctx,
167             Idx str_idx,
168             re_node_set *cur_nodes,
169             re_node_set *next_nodes)
170             internal_function;
171             static reg_errcode_t check_arrival_expand_ecl (pTHX_ const re_dfa_t *dfa,
172             re_node_set *cur_nodes,
173             Idx ex_subexp, int type)
174             internal_function;
175             static reg_errcode_t check_arrival_expand_ecl_sub (pTHX_ const re_dfa_t *dfa,
176             re_node_set *dst_nodes,
177             Idx target, Idx ex_subexp,
178             int type) internal_function;
179             static reg_errcode_t expand_bkref_cache (pTHX_ re_match_context_t *mctx,
180             re_node_set *cur_nodes, Idx cur_str,
181             Idx subexp_num, int type)
182             internal_function;
183             static bool build_trtable (pTHX_ const re_dfa_t *dfa,
184             re_dfastate_t *state) internal_function;
185             #ifdef RE_ENABLE_I18N
186             static int check_node_accept_bytes (pTHX_ const re_dfa_t *dfa, Idx node_idx,
187             const re_string_t *input, SV *sv, Idx idx)
188             internal_function;
189             # ifdef _LIBC
190             static unsigned int find_collation_sequence_value (pTHX_ const unsigned char *mbs,
191             size_t name_len)
192             internal_function;
193             # endif /* _LIBC */
194             #endif /* RE_ENABLE_I18N */
195             static Idx group_nodes_into_DFAstates (pTHX_ const re_dfa_t *dfa,
196             const re_dfastate_t *state,
197             re_node_set *states_node,
198             bitset_t *states_ch) internal_function;
199             static bool check_node_accept (pTHX_ const re_match_context_t *mctx,
200             const re_token_t *node, Idx idx)
201             internal_function;
202             static reg_errcode_t extend_buffers (pTHX_ re_match_context_t *mctx, Idx min_len)
203             internal_function;
204            
205             /* Entry point for POSIX code. */
206              
207             /* regexec searches for a given pattern, specified by PREG, in the
208             string STRING.
209              
210             If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211             'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
212             least NMATCH elements, and we set them to the offsets of the
213             corresponding matched substrings.
214              
215             EFLAGS specifies "execution flags" which affect matching: if
216             REG_NOTBOL is set, then ^ does not match at the beginning of the
217             string; if REG_NOTEOL is set, then $ does not match at the end.
218              
219             We return 0 if we find a match and REG_NOMATCH if not. */
220              
221             int
222 0           regexec (pTHX_ const regex_t *_Restrict_ preg, const char *_Restrict_ string, SV *sv, size_t nmatch, regmatch_t pmatch[_Restrict_arr_], int eflags)
223             {
224             reg_errcode_t err;
225             Idx start, length;
226 0           re_dfa_t *dfa = preg->buffer;
227              
228 0 0         if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
229 0           return REG_BADPAT;
230              
231 0 0         if (eflags & REG_STARTEND)
232             {
233 0           start = pmatch[0].rm_so;
234 0           length = pmatch[0].rm_eo;
235             }
236             else
237             {
238 0           start = 0;
239 0           length = strlen (string);
240             }
241              
242             lock_lock (dfa->lock);
243 0 0         if (preg->no_sub)
244 0           err = re_search_internal (aTHX_ preg, string, length, sv, start, length,
245             length, 0, NULL, eflags);
246             else
247 0           err = re_search_internal (aTHX_ preg, string, length, sv, start, length,
248             length, nmatch, pmatch, eflags);
249             lock_unlock (dfa->lock);
250 0           return err != REG_NOERROR;
251             }
252              
253             #ifdef _LIBC
254             # include
255             versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
256              
257             # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
258             __typeof__ (__regexec) __compat_regexec;
259              
260             int
261             attribute_compat_text_section
262             __compat_regexec (pTHX_ const regex_t *_Restrict_ preg,
263             const char *_Restrict_ string, size_t nmatch,
264             regmatch_t pmatch[], int eflags)
265             {
266             return regexec (preg, string, nmatch, pmatch,
267             eflags & (REG_NOTBOL | REG_NOTEOL));
268             }
269             compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
270             # endif
271             #endif
272              
273             /* Entry points for GNU code. */
274              
275             /* re_match, re_search, re_match_2, re_search_2
276              
277             The former two functions operate on STRING with length LENGTH,
278             while the later two operate on concatenation of STRING1 and STRING2
279             with lengths LENGTH1 and LENGTH2, respectively.
280              
281             re_match() matches the compiled pattern in BUFP against the string,
282             starting at index START.
283              
284             re_search() first tries matching at index START, then it tries to match
285             starting from index START + 1, and so on. The last start position tried
286             is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
287             way as re_match().)
288              
289             The parameter STOP of re_{match,search}_2 specifies that no match exceeding
290             the first STOP characters of the concatenation of the strings should be
291             concerned.
292              
293             If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
294             and all groups is stored in REGS. (For the "_2" variants, the offsets are
295             computed relative to the concatenation, not relative to the individual
296             strings.)
297              
298             On success, re_match* functions return the length of the match, re_search*
299             return the position of the start of the match. Return value -1 means no
300             match was found and -2 indicates an internal error. */
301              
302             regoff_t
303 0           re_match (pTHX_ struct re_pattern_buffer *bufp, const char *string, Idx length, SV *sv, Idx start, struct re_registers *regs)
304             {
305 0           return re_search_stub (aTHX_ bufp, string, length, sv, start, 0, length, regs, true);
306             }
307             #ifdef _LIBC
308             weak_alias (__re_match, re_match)
309             #endif
310              
311             regoff_t
312 14           re_search (pTHX_ struct re_pattern_buffer *bufp, const char *string, Idx length, SV *sv, Idx start, regoff_t range, struct re_registers *regs)
313             {
314 14           return re_search_stub (aTHX_ bufp, string, length, sv, start, range, length, regs,
315             false);
316             }
317             #ifdef _LIBC
318             weak_alias (__re_search, re_search)
319             #endif
320              
321             regoff_t
322 0           re_match_2 (pTHX_ struct re_pattern_buffer *bufp, const char *string1, Idx length1, SV *sv1, const char *string2, Idx length2, SV *sv2, Idx start, struct re_registers *regs, Idx stop)
323             {
324 0           return re_search_2_stub (aTHX_ bufp, string1, length1, sv1, string2, length2, sv2,
325             start, 0, regs, stop, true);
326             }
327             #ifdef _LIBC
328             weak_alias (__re_match_2, re_match_2)
329             #endif
330              
331             regoff_t
332 0           re_search_2 (pTHX_ struct re_pattern_buffer *bufp, const char *string1, Idx length1, SV *sv1, const char *string2, Idx length2, SV *sv2, Idx start, regoff_t range, struct re_registers *regs, Idx stop)
333             {
334 0           return re_search_2_stub (aTHX_ bufp, string1, length1, sv1, string2, length2, sv2,
335             start, range, regs, stop, false);
336             }
337             #ifdef _LIBC
338             weak_alias (__re_search_2, re_search_2)
339             #endif
340              
341             static regoff_t
342 0           re_search_2_stub (pTHX_ struct re_pattern_buffer *bufp,
343             const char *string1, Idx length1, SV *sv1,
344             const char *string2, Idx length2, SV *sv2,
345             Idx start, regoff_t range, struct re_registers *regs,
346             Idx stop, bool ret_len)
347             {
348             const char *str;
349             regoff_t rval;
350 0           Idx len = length1 + length2;
351 0           char *s = NULL;
352              
353             /* Although this is an extension from us, never used, we say that utf8 behaviour of the two strings
354             must be the same */
355 0 0         if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1 || DO_UTF8(sv1) != DO_UTF8(sv2), 0))
    0          
    0          
    0          
    0          
    0          
356 0           return -2;
357              
358             /* Concatenate the strings. */
359 0 0         if (length2 > 0)
360 0 0         if (length1 > 0)
361             {
362 0           re_malloc (s ,char, len);
363             #ifdef _LIBC
364             CANNOT BE REACHED: memcpy (__mempcpy (s, string1, length1), string2, length2);
365             #else
366 0           Copy (string1, s, length1, char);
367 0           Copy (string2, s + length1, length2, char);
368             #endif
369 0           str = s;
370             }
371             else
372 0           str = string2;
373             else
374 0           str = string1;
375              
376             {
377 0           SV *sv_tmp = newSVsv(sv1);
378 0           sv_catsv_mg(sv_tmp, sv2);
379 0           rval = re_search_stub (aTHX_ bufp, str, len, sv_tmp, start, range, stop, regs,
380             ret_len);
381 0           SvREFCNT_dec(sv_tmp);
382             }
383 0 0         re_free (s);
384 0           return rval;
385             }
386              
387             /* The parameters have the same meaning as those of re_search.
388             Additional parameters:
389             If RET_LEN is true the length of the match is returned (re_match style);
390             otherwise the position of the match is returned. */
391              
392             static regoff_t
393 14           re_search_stub (pTHX_ struct re_pattern_buffer *bufp,
394             const char *string, Idx length, SV *sv,
395             Idx start, regoff_t range, Idx stop, struct re_registers *regs,
396             bool ret_len)
397             {
398             reg_errcode_t result;
399             regmatch_t *pmatch;
400             Idx nregs;
401             regoff_t rval;
402 14           int eflags = 0;
403 14           re_dfa_t *dfa = bufp->buffer;
404 14           Idx last_start = start + range;
405              
406             /* Check for out-of-range. */
407 14 50         if (BE (start < 0 || start > length, 0))
408 0           return -1;
409 14 50         if (BE (length < last_start || (0 <= range && last_start < start), 0))
    50          
    50          
    50          
410 0           last_start = length;
411 14 50         else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
    0          
412 0           last_start = 0;
413              
414             lock_lock (dfa->lock);
415              
416 14           eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
417 14 50         eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
418              
419             /* Compile fastmap if we haven't yet. */
420 14 50         if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
    50          
    0          
421 0           re_compile_fastmap (aTHX_ bufp);
422              
423 14 50         if (BE (bufp->no_sub, 0))
424 0           regs = NULL;
425              
426             /* We need at least 1 register. */
427 14 50         if (regs == NULL)
428 0           nregs = 1;
429 14 50         else if (BE (bufp->regs_allocated == REGS_FIXED
    0          
430             && regs->num_regs <= bufp->re_nsub, 0))
431             {
432 0           nregs = regs->num_regs;
433 0 0         if (BE (nregs < 1, 0))
434             {
435             /* Nothing can be copied to regs. */
436 0           regs = NULL;
437 0           nregs = 1;
438             }
439             }
440             else
441 14           nregs = bufp->re_nsub + 1;
442              
443 14 50         re_malloc (pmatch, regmatch_t, nregs);
444 14           result = re_search_internal (aTHX_ bufp, string, length, sv, start, last_start, stop,
445             nregs, pmatch, eflags);
446              
447 14           rval = 0;
448              
449             /* I hope we needn't fill their regs with -1's when no match was found. */
450 14 100         if (result != REG_NOERROR)
451 2 50         rval = result == REG_NOMATCH ? -1 : -2;
452 12 50         else if (regs != NULL)
453             {
454             /* If caller wants register contents data back, copy them. */
455 12           bufp->regs_allocated = re_copy_regs (aTHX_ regs, pmatch, nregs,
456 12           bufp->regs_allocated);
457 12 50         if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
458 0           rval = -2;
459             }
460              
461 14 100         if (BE (rval == 0, 1))
462             {
463 12 50         if (ret_len)
464             {
465             assert (pmatch[0].rm_so == start);
466 0           rval = pmatch[0].rm_eo - start;
467             }
468             else
469 12           rval = pmatch[0].rm_so;
470             }
471 14 50         re_free (pmatch);
472             out:
473             lock_unlock (dfa->lock);
474 14           return rval;
475             }
476              
477             static unsigned
478 12           re_copy_regs (pTHX_ struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
479             int regs_allocated)
480             {
481 12           int rval = REGS_REALLOCATE;
482             Idx i;
483 12           Idx need_regs = nregs + 1;
484             /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
485             uses. */
486              
487             /* Have the register data arrays been allocated? */
488 12 100         if (regs_allocated == REGS_UNALLOCATED)
489             { /* No. So allocate them with malloc. */
490 5 50         re_malloc (regs->start, regoff_t, need_regs);
491 5 50         re_malloc (regs->end, regoff_t, need_regs);
492 5           regs->num_regs = need_regs;
493             }
494 7 50         else if (regs_allocated == REGS_REALLOCATE)
495             { /* Yes. If we need more elements than were already
496             allocated, reallocate them. If we need fewer, just
497             leave it alone. */
498 7 50         if (BE (need_regs > regs->num_regs, 0))
499             {
500 7 50         re_realloc (regs->start, regoff_t, need_regs);
501 7 50         re_realloc (regs->end, regoff_t, need_regs);
502 7           regs->num_regs = need_regs;
503             }
504             }
505             else
506             {
507             assert (regs_allocated == REGS_FIXED);
508             /* This function may not be called with REGS_FIXED and nregs too big. */
509             assert (regs->num_regs >= nregs);
510 0           rval = REGS_FIXED;
511             }
512              
513             /* Copy the regs. */
514 37 100         for (i = 0; i < nregs; ++i)
515             {
516 25           regs->start[i] = pmatch[i].rm_so;
517 25           regs->end[i] = pmatch[i].rm_eo;
518             }
519 24 100         for ( ; i < regs->num_regs; ++i)
520 12           regs->start[i] = regs->end[i] = -1;
521              
522 12           return rval;
523             }
524              
525             /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
526             ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
527             this memory for recording register information. STARTS and ENDS
528             must be allocated using the malloc library routine, and must each
529             be at least NUM_REGS * sizeof (regoff_t) bytes long.
530              
531             If NUM_REGS == 0, then subsequent matches should allocate their own
532             register data.
533              
534             Unless this function is called, the first search or match using
535             PATTERN_BUFFER will allocate its own register data, without
536             freeing the old data. */
537              
538             void
539 0           re_set_registers (pTHX_ struct re_pattern_buffer *bufp, struct re_registers *regs, __re_size_t num_regs, regoff_t *starts, regoff_t * ends)
540             {
541 0 0         if (num_regs)
542             {
543 0           bufp->regs_allocated = REGS_REALLOCATE;
544 0           regs->num_regs = num_regs;
545 0           regs->start = starts;
546 0           regs->end = ends;
547             }
548             else
549             {
550 0           bufp->regs_allocated = REGS_UNALLOCATED;
551 0           regs->num_regs = 0;
552 0           regs->start = regs->end = NULL;
553             }
554 0           }
555             #ifdef _LIBC
556             weak_alias (__re_set_registers, re_set_registers)
557             #endif
558            
559             /* Entry points compatible with 4.2 BSD regex library. We don't define
560             them unless specifically requested. */
561              
562             #if defined _REGEX_RE_COMP || defined _LIBC
563             int
564             # ifdef _LIBC
565             weak_function
566             # endif
567             re_exec (s)
568             const char *s;
569             {
570             return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
571             }
572             #endif /* _REGEX_RE_COMP */
573            
574             /* Internal entry point. */
575              
576             /* Searches for a compiled pattern PREG in the string STRING, whose
577             length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
578             meaning as with regexec. LAST_START is START + RANGE, where
579             START and RANGE have the same meaning as with re_search.
580             Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
581             otherwise return the error code.
582             Note: We assume front end functions already check ranges.
583             (0 <= LAST_START && LAST_START <= LENGTH) */
584              
585             static reg_errcode_t
586             __attribute_warn_unused_result__
587 14           re_search_internal (pTHX_ const regex_t *preg,
588             const char *string, Idx length, SV *sv,
589             Idx start, Idx last_start, Idx stop,
590             size_t nmatch, regmatch_t pmatch[],
591             int eflags)
592             {
593             reg_errcode_t err;
594 14           const re_dfa_t *dfa = preg->buffer;
595             Idx left_lim, right_lim;
596             int incr;
597             bool fl_longest_match;
598             int match_kind;
599             Idx match_first;
600 14           Idx match_last = REG_MISSING;
601             Idx extra_nmatch;
602             bool sb;
603             int ch;
604             #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
605 14           re_match_context_t mctx = { .dfa = dfa };
606             #else
607             re_match_context_t mctx;
608             #endif
609 0 0         char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
610 0 0         && start != last_start && !preg->can_be_null)
    0          
611 14 50         ? preg->fastmap : NULL);
612 14           RE_TRANSLATE_TYPE t = preg->translate;
613              
614             #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
615             memset (&mctx, '\0', sizeof (re_match_context_t));
616             mctx.dfa = dfa;
617             #endif
618             #ifdef _PERL_I18N
619 14           mctx.sv = sv;
620             #endif
621              
622 14 50         extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
623 14           nmatch -= extra_nmatch;
624              
625             /* Check if the DFA haven't been compiled. */
626 14 50         if (BE (preg->used == 0 || dfa->init_state == NULL
    50          
    50          
    50          
    50          
    50          
    50          
    50          
627             || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
628             || dfa->init_state_begbuf == NULL, 0))
629 0           return REG_NOMATCH;
630              
631             #ifdef DEBUG
632             /* We assume front-end functions already check them. */
633             assert (0 <= last_start && last_start <= length);
634             #endif
635              
636             /* If initial states with non-begbuf contexts have no elements,
637             the regex must be anchored. If preg->newline_anchor is set,
638             we'll never use init_state_nl, so do not check it. */
639 14 50         if (dfa->init_state->nodes.nelem == 0
640 0 0         && dfa->init_state_word->nodes.nelem == 0
641 0 0         && (dfa->init_state_nl->nodes.nelem == 0
642 0 0         || !preg->newline_anchor))
643             {
644 0 0         if (start != 0 && last_start != 0)
    0          
645 0           return REG_NOMATCH;
646 0           start = last_start = 0;
647             }
648              
649             /* We must check the longest matching, if nmatch > 0. */
650 14 50         fl_longest_match = (nmatch != 0 || dfa->nbackref);
    0          
651              
652 14           err = re_string_allocate (aTHX_ &mctx.input, string, length, /* c.f. comment in mbrtowc */ /* dfa->nodes_len + 1 */ length + 1,
653 14           preg->translate, (preg->syntax & RE_ICASE) != 0,
654             dfa);
655 14 50         if (BE (err != REG_NOERROR, 0))
656 0           goto free_return;
657 14           mctx.input.stop = stop;
658 14           mctx.input.raw_stop = stop;
659 14           mctx.input.newline_anchor = preg->newline_anchor;
660              
661 14           err = match_ctx_init (aTHX_ &mctx, eflags, dfa->nbackref * 2);
662 14 50         if (BE (err != REG_NOERROR, 0))
663 0           goto free_return;
664              
665             /* We will log all the DFA states through which the dfa pass,
666             if nmatch > 1, or this dfa has "multibyte node", which is a
667             back-reference or a node which can accept multibyte character or
668             multi character collating element. */
669 14 100         if (nmatch > 1 || dfa->has_mb_node)
    50          
670             {
671             /* Avoid overflow. */
672 13 50         if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
673             <= mctx.input.bufs_len), 0))
674             {
675 0           err = REG_ESPACE;
676 0           goto free_return;
677             }
678              
679 13 50         re_malloc (mctx.state_log, re_dfastate_t *, mctx.input.bufs_len + 1);
680             }
681             else
682 1           mctx.state_log = NULL;
683              
684 14           match_first = start;
685 14 50         mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
686             : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
687              
688             /* Check incrementally whether the input string matches. */
689 14 50         incr = (last_start < start) ? -1 : 1;
690 14           left_lim = (last_start < start) ? last_start : start;
691 14           right_lim = (last_start < start) ? start : last_start;
692 14           sb = dfa->mb_cur_max == 1;
693 14           match_kind =
694             (fastmap
695 0 0         ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
    0          
    0          
696 0 0         | (start <= last_start ? 2 : 0)
697 0           | (t != NULL ? 1 : 0))
698 14 50         : 8);
699              
700 39           for (;; match_first += incr)
701             {
702 53           err = REG_NOMATCH;
703 53 50         if (match_first < left_lim || right_lim < match_first)
    100          
704             goto free_return;
705              
706             /* Advance as rapidly as possible through the string, until we
707             find a plausible place to start matching. This may be done
708             with varying efficiency, so there are various possibilities:
709             only the most common of them are specialized, in order to
710             save on code size. We use a switch statement for speed. */
711 51           switch (match_kind)
712             {
713             case 8:
714             /* No fastmap. */
715 51           break;
716              
717             case 7:
718             /* Fastmap with single-byte translation, match forward. */
719 0 0         while (BE (match_first < right_lim, 1)
720 0 0         && !fastmap[t[(unsigned char) string[match_first]]])
721 0           ++match_first;
722 0           goto forward_match_found_start_or_reached_end;
723              
724             case 6:
725             /* Fastmap without translation, match forward. */
726 0 0         while (BE (match_first < right_lim, 1)
727 0 0         && !fastmap[(unsigned char) string[match_first]])
728 0           ++match_first;
729              
730             forward_match_found_start_or_reached_end:
731 0 0         if (BE (match_first == right_lim, 0))
732             {
733 0           ch = match_first >= length
734 0 0         ? 0 : (unsigned char) string[match_first];
735 0 0         if (!fastmap[t ? t[ch] : ch])
    0          
736 0           goto free_return;
737             }
738 0           break;
739              
740             case 4:
741             case 5:
742             /* Fastmap without multi-byte translation, match backwards. */
743 0 0         while (match_first >= left_lim)
744             {
745 0           ch = match_first >= length
746 0 0         ? 0 : (unsigned char) string[match_first];
747 0 0         if (fastmap[t ? t[ch] : ch])
    0          
748 0           break;
749 0           --match_first;
750             }
751 0 0         if (match_first < left_lim)
752 0           goto free_return;
753 0           break;
754              
755             default:
756             /* In this case, we can't determine easily the current byte,
757             since it might be a component byte of a multibyte
758             character. Then we use the constructed buffer instead. */
759             for (;;)
760             {
761             /* If MATCH_FIRST is out of the valid range, reconstruct the
762             buffers. */
763 0           __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
764 0 0         if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
765             {
766 0           err = re_string_reconstruct (aTHX_ &mctx.input, match_first,
767             eflags);
768 0 0         if (BE (err != REG_NOERROR, 0))
769 0           goto free_return;
770              
771 0           offset = match_first - mctx.input.raw_mbs_idx;
772             }
773             /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
774             Note that MATCH_FIRST must not be smaller than 0. */
775 0           ch = (match_first >= length
776 0 0         ? 0 : re_string_byte_at (&mctx.input, offset));
777 0 0         if (fastmap[ch])
778 0           break;
779 0           match_first += incr;
780 0 0         if (match_first < left_lim || match_first > right_lim)
    0          
781             {
782 0           err = REG_NOMATCH;
783 0           goto free_return;
784             }
785 0           }
786 0           break;
787             }
788              
789             /* Reconstruct the buffers so that the matcher can assume that
790             the matching starts from the beginning of the buffer. */
791 51           err = re_string_reconstruct (aTHX_ &mctx.input, match_first, eflags);
792 51 50         if (BE (err != REG_NOERROR, 0))
793 0           goto free_return;
794              
795             #ifdef RE_ENABLE_I18N
796             /* Don't consider this char as a possible match start if it part,
797             yet isn't the head, of a multibyte character. */
798 51 50         if (!sb && !re_string_first_byte (&mctx.input, 0))
    100          
    100          
799 16           continue;
800             #endif
801              
802             /* It seems to be appropriate one, then use the matcher. */
803             /* We assume that the matching starts from 0. */
804 35           mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
805 35 50         match_last = check_matching (aTHX_ &mctx, fl_longest_match,
806             start <= last_start ? &match_first : NULL);
807 35 100         if (match_last != REG_MISSING)
808             {
809 12 50         if (BE (match_last == REG_ERROR, 0))
810             {
811 0           err = REG_ESPACE;
812 0           goto free_return;
813             }
814             else
815             {
816 12           mctx.match_last = match_last;
817 12 50         if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
    50          
    0          
818             {
819 12           re_dfastate_t *pstate = mctx.state_log[match_last];
820 12           mctx.last_node = check_halt_state_context (aTHX_ &mctx, pstate,
821             match_last);
822             }
823 12 50         if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
    50          
    100          
824 3 50         || dfa->nbackref)
825             {
826 9           err = prune_impossible_nodes (aTHX_ &mctx);
827 9 50         if (err == REG_NOERROR)
828 9           break;
829 0 0         if (BE (err != REG_NOMATCH, 0))
830 0           goto free_return;
831 0           match_last = REG_MISSING;
832             }
833             else
834             break; /* We found a match. */
835             }
836             }
837              
838 23           match_ctx_clean (aTHX_ &mctx);
839 39           }
840              
841             #ifdef DEBUG
842             assert (match_last != REG_MISSING);
843             assert (err == REG_NOERROR);
844             #endif
845              
846             /* Set pmatch[] if we need. */
847 12 50         if (nmatch > 0)
848             {
849             Idx reg_idx;
850              
851             /* Initialize registers. */
852 25 100         for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
853 13           pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
854              
855             /* Set the points where matching start/end. */
856 12           pmatch[0].rm_so = 0;
857 12           pmatch[0].rm_eo = mctx.match_last;
858             /* FIXME: This function should fail if mctx.match_last exceeds
859             the maximum possible regoff_t value. We need a new error
860             code REG_OVERFLOW. */
861              
862 12 50         if (!preg->no_sub && nmatch > 1)
    50          
863             {
864 12           err = set_regs (aTHX_ preg, &mctx, nmatch, pmatch,
865 12 100         dfa->has_plural_match && dfa->nbackref > 0);
    50          
866 12 50         if (BE (err != REG_NOERROR, 0))
867 0           goto free_return;
868             }
869              
870             /* At last, add the offset to each register, since we slid
871             the buffers so that we could assume that the matching starts
872             from 0. */
873 37 100         for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
874 25 50         if (pmatch[reg_idx].rm_so != -1)
875             {
876             #ifdef RE_ENABLE_I18N
877 25 50         if (BE (mctx.input.offsets_needed != 0, 0))
878             {
879 0 0         pmatch[reg_idx].rm_so =
880 0           (pmatch[reg_idx].rm_so == mctx.input.valid_len
881 0           ? mctx.input.valid_raw_len
882 0           : mctx.input.offsets[pmatch[reg_idx].rm_so]);
883 0 0         pmatch[reg_idx].rm_eo =
884 0           (pmatch[reg_idx].rm_eo == mctx.input.valid_len
885 0           ? mctx.input.valid_raw_len
886 0           : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
887             }
888             #else
889             assert (mctx.input.offsets_needed == 0);
890             #endif
891 25           pmatch[reg_idx].rm_so += match_first;
892 25           pmatch[reg_idx].rm_eo += match_first;
893             }
894 12 50         for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
895             {
896 0           pmatch[nmatch + reg_idx].rm_so = -1;
897 0           pmatch[nmatch + reg_idx].rm_eo = -1;
898             }
899              
900 12 50         if (dfa->subexp_map)
901 0 0         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
902 0 0         if (dfa->subexp_map[reg_idx] != reg_idx)
903             {
904 0           pmatch[reg_idx + 1].rm_so
905 0           = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
906 0           pmatch[reg_idx + 1].rm_eo
907 0           = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
908             }
909             }
910              
911             free_return:
912 14 100         re_free (mctx.state_log);
913 14 50         if (dfa->nbackref)
914 0           match_ctx_free (aTHX_ &mctx);
915 14           re_string_destruct (aTHX_ &mctx.input);
916 14           return err;
917             }
918              
919             static reg_errcode_t
920             __attribute_warn_unused_result__
921 9           prune_impossible_nodes (pTHX_ re_match_context_t *mctx)
922             {
923 9           const re_dfa_t *const dfa = mctx->dfa;
924             Idx halt_node, match_last;
925             reg_errcode_t ret;
926             re_dfastate_t **sifted_states;
927 9           re_dfastate_t **lim_states = NULL;
928             re_sift_context_t sctx;
929             #ifdef DEBUG
930             assert (mctx->state_log != NULL);
931             #endif
932 9           match_last = mctx->match_last;
933 9           halt_node = mctx->last_node;
934              
935             /* Avoid overflow. */
936 9 50         if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
937 0           return REG_ESPACE;
938              
939 9 50         re_malloc (sifted_states, re_dfastate_t *, match_last + 1);
940 9 50         if (dfa->nbackref)
941             {
942 0 0         re_malloc (lim_states, re_dfastate_t *, match_last + 1);
943             while (1)
944             {
945 0           memset (lim_states, '\0',
946             sizeof (re_dfastate_t *) * (match_last + 1));
947 0           sift_ctx_init (aTHX_ &sctx, sifted_states, lim_states, halt_node,
948             match_last);
949 0           ret = sift_states_backward (aTHX_ mctx, &sctx);
950 0 0         re_node_set_free (&sctx.limits);
951 0 0         if (BE (ret != REG_NOERROR, 0))
952 0           goto free_return;
953 0 0         if (sifted_states[0] != NULL || lim_states[0] != NULL)
    0          
954             break;
955             do
956             {
957 0           --match_last;
958 0 0         if (! REG_VALID_INDEX (match_last))
959             {
960 0           ret = REG_NOMATCH;
961 0           goto free_return;
962             }
963 0           } while (mctx->state_log[match_last] == NULL
964 0 0         || !mctx->state_log[match_last]->halt);
    0          
965 0           halt_node = check_halt_state_context (aTHX_ mctx,
966 0           mctx->state_log[match_last],
967             match_last);
968 0           }
969 0           ret = merge_state_array (aTHX_ dfa, sifted_states, lim_states,
970             match_last + 1);
971 0 0         re_free (lim_states);
972 0           lim_states = NULL;
973 0 0         if (BE (ret != REG_NOERROR, 0))
974 0           goto free_return;
975             }
976             else
977             {
978 9           sift_ctx_init (aTHX_ &sctx, sifted_states, lim_states, halt_node, match_last);
979 9           ret = sift_states_backward (aTHX_ mctx, &sctx);
980 9 50         re_node_set_free (&sctx.limits);
981 9 50         if (BE (ret != REG_NOERROR, 0))
982 0           goto free_return;
983 9 50         if (sifted_states[0] == NULL)
984             {
985 0           ret = REG_NOMATCH;
986 0           goto free_return;
987             }
988             }
989 9 50         re_free (mctx->state_log);
990 9           mctx->state_log = sifted_states;
991 9           sifted_states = NULL;
992 9           mctx->last_node = halt_node;
993 9           mctx->match_last = match_last;
994 9           ret = REG_NOERROR;
995             free_return:
996 9 50         re_free (sifted_states);
997 9 50         re_free (lim_states);
998 9           return ret;
999             }
1000              
1001             /* Acquire an initial state and return it.
1002             We must select appropriate initial state depending on the context,
1003             since initial states may have constraints like "\<", "^", etc.. */
1004              
1005             static inline re_dfastate_t *
1006             __attribute__ ((always_inline)) internal_function
1007             acquire_init_state_context (pTHX_ reg_errcode_t *err, const re_match_context_t *mctx,
1008             Idx idx)
1009             {
1010 35           const re_dfa_t *const dfa = mctx->dfa;
1011 35 50         if (dfa->init_state->has_constraint)
1012             {
1013             unsigned int context;
1014 0           context = re_string_context_at (aTHX_ &mctx->input, idx - 1, mctx->eflags);
1015 0 0         if (IS_WORD_CONTEXT (context))
1016 0           return dfa->init_state_word;
1017 0 0         else if (IS_ORDINARY_CONTEXT (context))
1018 0           return dfa->init_state;
1019 0 0         else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
    0          
1020 0           return dfa->init_state_begbuf;
1021 0 0         else if (IS_NEWLINE_CONTEXT (context))
1022 0           return dfa->init_state_nl;
1023 0 0         else if (IS_BEGBUF_CONTEXT (context))
1024             {
1025             /* It is relatively rare case, then calculate on demand. */
1026 0           return re_acquire_state_context (aTHX_ err, dfa,
1027 0           dfa->init_state->entrance_nodes,
1028             context);
1029             }
1030             else
1031             /* Must not happen? */
1032 0           return dfa->init_state;
1033             }
1034             else
1035 35           return dfa->init_state;
1036             }
1037              
1038             /* Check whether the regular expression match input string INPUT or not,
1039             and return the index where the matching end. Return REG_MISSING if
1040             there is no match, and return REG_ERROR in case of an error.
1041             FL_LONGEST_MATCH means we want the POSIX longest matching.
1042             If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1043             next place where we may want to try matching.
1044             Note that the matcher assumes that the matching starts from the current
1045             index of the buffer. */
1046              
1047             static Idx
1048             internal_function __attribute_warn_unused_result__
1049 35           check_matching (pTHX_ re_match_context_t *mctx, bool fl_longest_match,
1050             Idx *p_match_first)
1051             {
1052 35           const re_dfa_t *const dfa = mctx->dfa;
1053             reg_errcode_t err;
1054 35           Idx match = 0;
1055 35           Idx match_last = REG_MISSING;
1056 35           Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1057             re_dfastate_t *cur_state;
1058 35           bool at_init_state = p_match_first != NULL;
1059 35           Idx next_start_idx = cur_str_idx;
1060              
1061 35           err = REG_NOERROR;
1062 35           cur_state = acquire_init_state_context (aTHX_ &err, mctx, cur_str_idx);
1063             /* An initial state must not be NULL (invalid). */
1064 35 50         if (BE (cur_state == NULL, 0))
1065             {
1066             assert (err == REG_ESPACE);
1067 0           return REG_ERROR;
1068             }
1069              
1070 35 100         if (mctx->state_log != NULL)
1071             {
1072 21           mctx->state_log[cur_str_idx] = cur_state;
1073              
1074             /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1075             later. E.g. Processing back references. */
1076 21 50         if (BE (dfa->nbackref, 0))
1077             {
1078 0           at_init_state = false;
1079 0           err = check_subexp_matching_top (aTHX_ mctx, &cur_state->nodes, 0);
1080 0 0         if (BE (err != REG_NOERROR, 0))
1081 0           return err;
1082              
1083 0 0         if (cur_state->has_backref)
1084             {
1085 0           err = transit_state_bkref (aTHX_ mctx, &cur_state->nodes);
1086 0 0         if (BE (err != REG_NOERROR, 0))
1087 0           return err;
1088             }
1089             }
1090             }
1091              
1092             /* If the RE accepts NULL string. */
1093 35 50         if (BE (cur_state->halt, 0))
1094             {
1095 0 0         if (!cur_state->has_constraint
1096 0 0         || check_halt_state_context (aTHX_ mctx, cur_state, cur_str_idx))
1097             {
1098 0 0         if (!fl_longest_match)
1099 0           return cur_str_idx;
1100             else
1101             {
1102 0           match_last = cur_str_idx;
1103 0           match = 1;
1104             }
1105             }
1106             }
1107              
1108 66 100         while (!re_string_eoi (&mctx->input))
1109             {
1110 61           re_dfastate_t *old_state = cur_state;
1111 61           Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1112              
1113 61 50         if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1114 0 0         && mctx->input.bufs_len < mctx->input.len)
1115 61 100         || (BE (next_char_idx >= mctx->input.valid_len, 0)
1116 7 50         && mctx->input.valid_len < mctx->input.len))
1117             {
1118 0           err = extend_buffers (aTHX_ mctx, next_char_idx + 1);
1119 0 0         if (BE (err != REG_NOERROR, 0))
1120             {
1121             assert (err == REG_ESPACE);
1122 0           return REG_ERROR;
1123             }
1124             }
1125              
1126 61           cur_state = transit_state (aTHX_ &err, mctx, cur_state);
1127 61 100         if (mctx->state_log != NULL)
1128 46           cur_state = merge_state_with_log (aTHX_ &err, mctx, cur_state);
1129              
1130 61 100         if (cur_state == NULL)
1131             {
1132             /* Reached the invalid state or an error. Try to recover a valid
1133             state using the state log, if available and if we have not
1134             already found a valid (even if not the longest) match. */
1135 46 50         if (BE (err != REG_NOERROR, 0))
1136 0           return REG_ERROR;
1137              
1138 46 100         if (mctx->state_log == NULL
1139 33 100         || (match && !fl_longest_match)
    50          
1140 33 100         || (cur_state = find_recover_state (aTHX_ &err, mctx)) == NULL)
1141             break;
1142             }
1143              
1144 31 100         if (BE (at_init_state, 0))
1145             {
1146 21 50         if (old_state == cur_state)
1147 0           next_start_idx = next_char_idx;
1148             else
1149 21           at_init_state = false;
1150             }
1151              
1152 31 100         if (cur_state->halt)
1153             {
1154             /* Reached a halt state.
1155             Check the halt state can satisfy the current context. */
1156 12 50         if (!cur_state->has_constraint
1157 0 0         || check_halt_state_context (aTHX_ mctx, cur_state,
1158             re_string_cur_idx (&mctx->input)))
1159             {
1160             /* We found an appropriate halt state. */
1161 12           match_last = re_string_cur_idx (aTHX_ &mctx->input);
1162 12           match = 1;
1163              
1164             /* We found a match, do not modify match_first below. */
1165 12           p_match_first = NULL;
1166 12 50         if (!fl_longest_match)
1167 0           break;
1168             }
1169             }
1170             }
1171              
1172 35 100         if (p_match_first)
1173 23           *p_match_first += next_start_idx;
1174              
1175 35           return match_last;
1176             }
1177              
1178             /* Check NODE match the current context. */
1179              
1180             static bool
1181             internal_function
1182 21           check_halt_node_context (pTHX_ const re_dfa_t *dfa, Idx node, unsigned int context)
1183             {
1184 21           re_token_type_t type = dfa->nodes[node].type;
1185 21           unsigned int constraint = dfa->nodes[node].constraint;
1186 21 100         if (type != END_OF_RE)
1187 9           return false;
1188 12 50         if (!constraint)
1189 12           return true;
1190 0 0         if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
    0          
    0          
    0          
    0          
    0          
    0          
    0          
1191 0           return false;
1192 0           return true;
1193             }
1194              
1195             /* Check the halt state STATE match the current context.
1196             Return 0 if not match, if the node, STATE has, is a halt node and
1197             match the context, return the node. */
1198              
1199             static Idx
1200             internal_function
1201 12           check_halt_state_context (pTHX_ const re_match_context_t *mctx,
1202             const re_dfastate_t *state, Idx idx)
1203             {
1204             Idx i;
1205             unsigned int context;
1206             #ifdef DEBUG
1207             assert (state->halt);
1208             #endif
1209 12           context = re_string_context_at (aTHX_ &mctx->input, idx, mctx->eflags);
1210 21 50         for (i = 0; i < state->nodes.nelem; ++i)
1211 21 100         if (check_halt_node_context (aTHX_ mctx->dfa, state->nodes.elems[i], context))
1212 12           return state->nodes.elems[i];
1213 0           return 0;
1214             }
1215              
1216             /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1217             corresponding to the DFA).
1218             Return the destination node, and update EPS_VIA_NODES;
1219             return REG_MISSING in case of errors. */
1220              
1221             static Idx
1222             internal_function
1223 57           proceed_next_node (pTHX_ const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1224             Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1225             struct re_fail_stack_t *fs)
1226             {
1227 57           const re_dfa_t *const dfa = mctx->dfa;
1228             Idx i;
1229             bool ok;
1230 57 100         if (IS_EPSILON_NODE (dfa->nodes[node].type))
1231             {
1232 35           re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1233 35           re_node_set *edests = &dfa->edests[node];
1234             Idx dest_node;
1235 35           ok = re_node_set_insert (aTHX_ eps_via_nodes, node);
1236 35 50         if (BE (! ok, 0))
1237 0           return REG_ERROR;
1238             /* Pick up a valid destination, or return REG_MISSING if none
1239             is found. */
1240 79 100         for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1241             {
1242 44           Idx candidate = edests->elems[i];
1243 44 100         if (!re_node_set_contains (aTHX_ cur_nodes, candidate))
1244 9           continue;
1245 35 50         if (dest_node == REG_MISSING)
1246 35           dest_node = candidate;
1247              
1248             else
1249             {
1250             /* In order to avoid infinite loop like "(a*)*", return the second
1251             epsilon-transition if the first was already considered. */
1252 0 0         if (re_node_set_contains (aTHX_ eps_via_nodes, dest_node))
1253 0           return candidate;
1254              
1255             /* Otherwise, push the second epsilon-transition on the fail stack. */
1256 0 0         else if (fs != NULL
1257 0 0         && push_fail_stack (aTHX_ fs, *pidx, candidate, nregs, regs,
1258             eps_via_nodes))
1259 0           return REG_ERROR;
1260              
1261             /* We know we are going to exit. */
1262 0           break;
1263             }
1264             }
1265 35           return dest_node;
1266             }
1267             else
1268             {
1269 22           Idx naccepted = 0;
1270 22           re_token_type_t type = dfa->nodes[node].type;
1271              
1272             #ifdef RE_ENABLE_I18N
1273 22 100         if (dfa->nodes[node].accept_mb)
1274 9           naccepted = check_node_accept_bytes (aTHX_ dfa, node, &mctx->input, mctx->sv, *pidx);
1275             else
1276             #endif /* RE_ENABLE_I18N */
1277 13 50         if (type == OP_BACK_REF)
1278             {
1279 0           Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1280 0           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1281 0 0         if (fs != NULL)
1282             {
1283 0 0         if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
    0          
1284 0           return REG_MISSING;
1285 0 0         else if (naccepted)
1286             {
1287 0           char *buf = (char *) re_string_get_buffer (aTHX_ &mctx->input);
1288 0 0         if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1289             naccepted) != 0)
1290 0           return REG_MISSING;
1291             }
1292             }
1293              
1294 0 0         if (naccepted == 0)
1295             {
1296             Idx dest_node;
1297 0           ok = re_node_set_insert (aTHX_ eps_via_nodes, node);
1298 0 0         if (BE (! ok, 0))
1299 0           return REG_ERROR;
1300 0           dest_node = dfa->edests[node].elems[0];
1301 0 0         if (re_node_set_contains (aTHX_ &mctx->state_log[*pidx]->nodes,
1302             dest_node))
1303 0           return dest_node;
1304             }
1305             }
1306              
1307 22 100         if (naccepted != 0
1308 13 50         || check_node_accept (aTHX_ mctx, dfa->nodes + node, *pidx))
1309             {
1310 22           Idx dest_node = dfa->nexts[node];
1311 22 100         *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1312 22 50         if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
    0          
    0          
1313 0 0         || !re_node_set_contains (aTHX_ &mctx->state_log[*pidx]->nodes,
1314             dest_node)))
1315 0           return REG_MISSING;
1316 22           re_node_set_empty (aTHX_ eps_via_nodes);
1317 22           return dest_node;
1318             }
1319             }
1320 0           return REG_MISSING;
1321             }
1322              
1323             static reg_errcode_t
1324             internal_function __attribute_warn_unused_result__
1325 0           push_fail_stack (pTHX_ struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1326             Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1327             {
1328             reg_errcode_t err;
1329 0           Idx num = fs->num++;
1330 0 0         if (fs->num == fs->alloc)
1331             {
1332             /* new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t) * fs->alloc * 2)); */
1333 0 0         Renew(fs->stack, (sizeof (struct re_fail_stack_ent_t) * fs->alloc * 2), struct re_fail_stack_ent_t);
1334 0           fs->alloc *= 2;
1335             }
1336 0           fs->stack[num].idx = str_idx;
1337 0           fs->stack[num].node = dest_node;
1338 0 0         re_malloc (fs->stack[num].regs, regmatch_t, nregs);
1339 0 0         Copy (regs, fs->stack[num].regs, nregs, regmatch_t);
1340 0           err = re_node_set_init_copy (aTHX_ &fs->stack[num].eps_via_nodes, eps_via_nodes);
1341 0           return err;
1342             }
1343              
1344             static Idx
1345             internal_function
1346 0           pop_fail_stack (pTHX_ struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1347             regmatch_t *regs, re_node_set *eps_via_nodes)
1348             {
1349 0           Idx num = --fs->num;
1350             assert (REG_VALID_INDEX (num));
1351 0           *pidx = fs->stack[num].idx;
1352 0 0         Copy (fs->stack[num].regs, regs, nregs, regmatch_t);
1353 0 0         re_node_set_free (eps_via_nodes);
1354 0 0         re_free (fs->stack[num].regs);
1355 0           *eps_via_nodes = fs->stack[num].eps_via_nodes;
1356 0           return fs->stack[num].node;
1357             }
1358              
1359             /* Set the positions where the subexpressions are starts/ends to registers
1360             PMATCH.
1361             Note: We assume that pmatch[0] is already set, and
1362             pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1363              
1364             static reg_errcode_t
1365             internal_function __attribute_warn_unused_result__
1366 12           set_regs (pTHX_ const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1367             regmatch_t *pmatch, bool fl_backtrack)
1368             {
1369 12           const re_dfa_t *dfa = preg->buffer;
1370             Idx idx, cur_node;
1371             re_node_set eps_via_nodes;
1372             struct re_fail_stack_t *fs;
1373 12           struct re_fail_stack_t fs_body = { 0, 2, NULL };
1374             regmatch_t *prev_idx_match;
1375 12           bool prev_idx_match_malloced = false;
1376              
1377             #ifdef DEBUG
1378             assert (nmatch > 1);
1379             assert (mctx->state_log != NULL);
1380             #endif
1381 12 50         if (fl_backtrack)
1382             {
1383 0           fs = &fs_body;
1384 0 0         re_malloc (fs->stack, struct re_fail_stack_ent_t, fs->alloc);
1385             }
1386             else
1387 12           fs = NULL;
1388              
1389 12           cur_node = dfa->init_node;
1390 12           re_node_set_init_empty (&eps_via_nodes);
1391              
1392             if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1393             prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1394             else
1395             {
1396 12 50         re_malloc (prev_idx_match, regmatch_t, nmatch);
1397 12           prev_idx_match_malloced = true;
1398             }
1399 12 50         Copy (pmatch, prev_idx_match, nmatch, regmatch_t);
1400              
1401 69 50         for (idx = (Idx)pmatch[0].rm_so; idx <= (Idx)pmatch[0].rm_eo ;)
1402             {
1403 69           update_regs (aTHX_ dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1404              
1405 69 100         if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
    100          
1406             {
1407             Idx reg_idx;
1408 12 50         if (fs)
1409             {
1410 0 0         for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1411 0 0         if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
    0          
1412 0           break;
1413 0 0         if (reg_idx == nmatch)
1414             {
1415 0 0         re_node_set_free (&eps_via_nodes);
1416 0 0         if (prev_idx_match_malloced)
1417 0 0         re_free (prev_idx_match);
1418 0           return free_fail_stack_return (aTHX_ fs);
1419             }
1420 0           cur_node = pop_fail_stack (aTHX_ fs, &idx, nmatch, pmatch,
1421             &eps_via_nodes);
1422             }
1423             else
1424             {
1425 12 50         re_node_set_free (&eps_via_nodes);
1426 12 50         if (prev_idx_match_malloced)
1427 12 50         re_free (prev_idx_match);
1428 12           return REG_NOERROR;
1429             }
1430             }
1431              
1432             /* Proceed to next node. */
1433 57           cur_node = proceed_next_node (aTHX_ mctx, nmatch, pmatch, &idx, cur_node,
1434             &eps_via_nodes, fs);
1435              
1436 57 50         if (BE (! REG_VALID_INDEX (cur_node), 0))
1437             {
1438 0 0         if (BE (cur_node == REG_ERROR, 0))
1439             {
1440 0 0         re_node_set_free (&eps_via_nodes);
1441 0 0         if (prev_idx_match_malloced)
1442 0 0         re_free (prev_idx_match);
1443 0           free_fail_stack_return (aTHX_ fs);
1444 0           return REG_ESPACE;
1445             }
1446 0 0         if (fs)
1447 0           cur_node = pop_fail_stack (aTHX_ fs, &idx, nmatch, pmatch,
1448             &eps_via_nodes);
1449             else
1450             {
1451 0 0         re_node_set_free (&eps_via_nodes);
1452 0 0         if (prev_idx_match_malloced)
1453 0 0         re_free (prev_idx_match);
1454 0           return REG_NOMATCH;
1455             }
1456             }
1457             }
1458 0 0         re_node_set_free (&eps_via_nodes);
1459 0 0         if (prev_idx_match_malloced)
1460 0 0         re_free (prev_idx_match);
1461 12           return free_fail_stack_return (aTHX_ fs);
1462             }
1463              
1464             static reg_errcode_t
1465             internal_function
1466 0           free_fail_stack_return (pTHX_ struct re_fail_stack_t *fs)
1467             {
1468 0 0         if (fs)
1469             {
1470             Idx fs_idx;
1471 0 0         for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1472             {
1473 0 0         re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1474 0 0         re_free (fs->stack[fs_idx].regs);
1475             }
1476 0 0         re_free (fs->stack);
1477             }
1478 0           return REG_NOERROR;
1479             }
1480              
1481             static void
1482             internal_function
1483 69           update_regs (pTHX_ const re_dfa_t *dfa, regmatch_t *pmatch,
1484             regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1485             {
1486 69           int type = dfa->nodes[cur_node].type;
1487 69 100         if (type == OP_OPEN_SUBEXP)
1488             {
1489 13           Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1490              
1491             /* We are at the first node of this sub expression. */
1492 13 50         if (reg_num < nmatch)
1493             {
1494 13           pmatch[reg_num].rm_so = cur_idx;
1495 13           pmatch[reg_num].rm_eo = -1;
1496             }
1497             }
1498 56 100         else if (type == OP_CLOSE_SUBEXP)
1499             {
1500 13           Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1501 13 50         if (reg_num < nmatch)
1502             {
1503             /* We are at the last node of this sub expression. */
1504 13 50         if ((Idx)pmatch[reg_num].rm_so < cur_idx)
1505             {
1506 13           pmatch[reg_num].rm_eo = cur_idx;
1507             /* This is a non-empty match or we are not inside an optional
1508             subexpression. Accept this right away. */
1509 13 50         Copy (pmatch, prev_idx_match, nmatch, regmatch_t);
1510             }
1511             else
1512             {
1513 0 0         if (dfa->nodes[cur_node].opt_subexp
1514 0 0         && prev_idx_match[reg_num].rm_so != -1)
1515             /* We transited through an empty match for an optional
1516             subexpression, like (a?)*, and this is not the subexp's
1517             first match. Copy back the old content of the registers
1518             so that matches of an inner subexpression are undone as
1519             well, like in ((a?))*. */
1520 0 0         Copy (prev_idx_match, pmatch, nmatch, regmatch_t);
1521             else
1522             /* We completed a subexpression, but it may be part of
1523             an optional one, so do not update PREV_IDX_MATCH. */
1524 0           pmatch[reg_num].rm_eo = cur_idx;
1525             }
1526             }
1527             }
1528 69           }
1529              
1530             /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1531             and sift the nodes in each states according to the following rules.
1532             Updated state_log will be wrote to STATE_LOG.
1533              
1534             Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1535             1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1536             If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1537             the LAST_NODE, we throw away the node 'a'.
1538             2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1539             string 's' and transit to 'b':
1540             i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1541             away the node 'a'.
1542             ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1543             thrown away, we throw away the node 'a'.
1544             3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1545             i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1546             node 'a'.
1547             ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1548             we throw away the node 'a'. */
1549              
1550             #define STATE_NODE_CONTAINS(state,node) \
1551             ((state) != NULL && re_node_set_contains (aTHX_ &(state)->nodes, node))
1552              
1553             static reg_errcode_t
1554             internal_function
1555 9           sift_states_backward (pTHX_ const re_match_context_t *mctx, re_sift_context_t *sctx)
1556             {
1557             reg_errcode_t err;
1558 9           int null_cnt = 0;
1559 9           Idx str_idx = sctx->last_str_idx;
1560             re_node_set cur_dest;
1561              
1562             #ifdef DEBUG
1563             assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1564             #endif
1565              
1566             /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1567             transit to the last_node and the last_node itself. */
1568 9           err = re_node_set_init_1 (aTHX_ &cur_dest, sctx->last_node);
1569 9 50         if (BE (err != REG_NOERROR, 0))
1570 0           return err;
1571 9           err = update_cur_sifted_state (aTHX_ mctx, sctx, str_idx, &cur_dest);
1572 9 50         if (BE (err != REG_NOERROR, 0))
1573 0           goto free_return;
1574              
1575             /* Then check each states in the state_log. */
1576 37 100         while (str_idx > 0)
1577             {
1578             /* Update counters. */
1579 28 100         null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1580 28 50         if (null_cnt > mctx->max_mb_elem_len)
1581             {
1582 0           memset (sctx->sifted_states, '\0',
1583             sizeof (re_dfastate_t *) * str_idx);
1584 0 0         re_node_set_free (&cur_dest);
1585 0           return REG_NOERROR;
1586             }
1587 28           re_node_set_empty (aTHX_ &cur_dest);
1588 28           --str_idx;
1589              
1590 28 100         if (mctx->state_log[str_idx])
1591             {
1592 10           err = build_sifted_states (aTHX_ mctx, sctx, str_idx, &cur_dest);
1593 10 50         if (BE (err != REG_NOERROR, 0))
1594 0           goto free_return;
1595             }
1596              
1597             /* Add all the nodes which satisfy the following conditions:
1598             - It can epsilon transit to a node in CUR_DEST.
1599             - It is in CUR_SRC.
1600             And update state_log. */
1601 28           err = update_cur_sifted_state (aTHX_ mctx, sctx, str_idx, &cur_dest);
1602 28 50         if (BE (err != REG_NOERROR, 0))
1603 0           goto free_return;
1604             }
1605 9           err = REG_NOERROR;
1606             free_return:
1607 9 50         re_node_set_free (&cur_dest);
1608 9           return err;
1609             }
1610              
1611             static reg_errcode_t
1612             internal_function __attribute_warn_unused_result__
1613 10           build_sifted_states (pTHX_ const re_match_context_t *mctx, re_sift_context_t *sctx,
1614             Idx str_idx, re_node_set *cur_dest)
1615             {
1616 10           const re_dfa_t *const dfa = mctx->dfa;
1617 10           const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1618             Idx i;
1619              
1620             /* Then build the next sifted state.
1621             We build the next sifted state on 'cur_dest', and update
1622             'sifted_states[str_idx]' with 'cur_dest'.
1623             Note:
1624             'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1625             'cur_src' points the node_set of the old 'state_log[str_idx]'
1626             (with the epsilon nodes pre-filtered out). */
1627 29 100         for (i = 0; i < cur_src->nelem; i++)
1628             {
1629 19           Idx prev_node = cur_src->elems[i];
1630 19           int naccepted = 0;
1631             bool ok;
1632              
1633             #ifdef DEBUG
1634             re_token_type_t type = dfa->nodes[prev_node].type;
1635             assert (!IS_EPSILON_NODE (type));
1636             #endif
1637             #ifdef RE_ENABLE_I18N
1638             /* If the node may accept "multi byte". */
1639 19 100         if (dfa->nodes[prev_node].accept_mb)
1640 9           naccepted = sift_states_iter_mb (aTHX_ mctx, sctx, prev_node,
1641             str_idx, sctx->last_str_idx);
1642             #endif /* RE_ENABLE_I18N */
1643              
1644             /* We don't check backreferences here.
1645             See update_cur_sifted_state(). */
1646 19 100         if (!naccepted
1647 10 100         && check_node_accept (aTHX_ mctx, dfa->nodes + prev_node, str_idx)
1648 1 50         && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
    50          
1649             dfa->nexts[prev_node]))
1650 1           naccepted = 1;
1651              
1652 19 100         if (naccepted == 0)
1653 9           continue;
1654              
1655 10 50         if (sctx->limits.nelem)
1656             {
1657 0           Idx to_idx = str_idx + naccepted;
1658 0 0         if (check_dst_limits (aTHX_ mctx, &sctx->limits,
1659 0           dfa->nexts[prev_node], to_idx,
1660             prev_node, str_idx))
1661 0           continue;
1662             }
1663 10           ok = re_node_set_insert (aTHX_ cur_dest, prev_node);
1664 10 50         if (BE (! ok, 0))
1665 0           return REG_ESPACE;
1666             }
1667              
1668 10           return REG_NOERROR;
1669             }
1670              
1671             /* Helper functions. */
1672              
1673             static reg_errcode_t
1674             internal_function
1675 16           clean_state_log_if_needed (pTHX_ re_match_context_t *mctx, Idx next_state_log_idx)
1676             {
1677 16           Idx top = mctx->state_log_top;
1678              
1679 16 50         if ((next_state_log_idx >= mctx->input.bufs_len
1680 0 0         && mctx->input.bufs_len < mctx->input.len)
1681 16 50         || (next_state_log_idx >= mctx->input.valid_len
1682 0 0         && mctx->input.valid_len < mctx->input.len))
1683             {
1684             reg_errcode_t err;
1685 0           err = extend_buffers (aTHX_ mctx, next_state_log_idx + 1);
1686 0 0         if (BE (err != REG_NOERROR, 0))
1687 0           return err;
1688             }
1689              
1690 16 50         if (top < next_state_log_idx)
1691             {
1692 16           memset (mctx->state_log + top + 1, '\0',
1693 16           sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1694 16           mctx->state_log_top = next_state_log_idx;
1695             }
1696 16           return REG_NOERROR;
1697             }
1698              
1699             static reg_errcode_t
1700             internal_function
1701 0           merge_state_array (pTHX_ const re_dfa_t *dfa, re_dfastate_t **dst,
1702             re_dfastate_t **src, Idx num)
1703             {
1704             Idx st_idx;
1705             reg_errcode_t err;
1706 0 0         for (st_idx = 0; st_idx < num; ++st_idx)
1707             {
1708 0 0         if (dst[st_idx] == NULL)
1709 0           dst[st_idx] = src[st_idx];
1710 0 0         else if (src[st_idx] != NULL)
1711             {
1712             re_node_set merged_set;
1713 0           err = re_node_set_init_union (aTHX_ &merged_set, &dst[st_idx]->nodes,
1714 0           &src[st_idx]->nodes);
1715 0 0         if (BE (err != REG_NOERROR, 0))
1716 0           return err;
1717 0           dst[st_idx] = re_acquire_state (aTHX_ &err, dfa, &merged_set);
1718 0 0         re_node_set_free (&merged_set);
1719 0 0         if (BE (err != REG_NOERROR, 0))
1720 0           return err;
1721             }
1722             }
1723 0           return REG_NOERROR;
1724             }
1725              
1726             static reg_errcode_t
1727             internal_function
1728 37           update_cur_sifted_state (pTHX_ const re_match_context_t *mctx,
1729             re_sift_context_t *sctx, Idx str_idx,
1730             re_node_set *dest_nodes)
1731             {
1732 37           const re_dfa_t *const dfa = mctx->dfa;
1733 37           reg_errcode_t err = REG_NOERROR;
1734             const re_node_set *candidates;
1735 74           candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1736 37 100         : &mctx->state_log[str_idx]->nodes);
1737              
1738 37 100         if (dest_nodes->nelem == 0)
1739 18           sctx->sifted_states[str_idx] = NULL;
1740             else
1741             {
1742 19 50         if (candidates)
1743             {
1744             /* At first, add the nodes which can epsilon transit to a node in
1745             DEST_NODE. */
1746 19           err = add_epsilon_src_nodes (aTHX_ dfa, dest_nodes, candidates);
1747 19 50         if (BE (err != REG_NOERROR, 0))
1748 0           return err;
1749              
1750             /* Then, check the limitations in the current sift_context. */
1751 19 50         if (sctx->limits.nelem)
1752             {
1753 0           err = check_subexp_limits (aTHX_ dfa, dest_nodes, candidates, &sctx->limits,
1754             mctx->bkref_ents, str_idx);
1755 0 0         if (BE (err != REG_NOERROR, 0))
1756 0           return err;
1757             }
1758             }
1759              
1760 19           sctx->sifted_states[str_idx] = re_acquire_state (aTHX_ &err, dfa, dest_nodes);
1761 19 50         if (BE (err != REG_NOERROR, 0))
1762 0           return err;
1763             }
1764              
1765 37 100         if (candidates && mctx->state_log[str_idx]->has_backref)
    50          
1766             {
1767 0           err = sift_states_bkref (aTHX_ mctx, sctx, str_idx, candidates);
1768 0 0         if (BE (err != REG_NOERROR, 0))
1769 0           return err;
1770             }
1771 37           return REG_NOERROR;
1772             }
1773              
1774             static reg_errcode_t
1775             internal_function __attribute_warn_unused_result__
1776 19           add_epsilon_src_nodes (pTHX_ const re_dfa_t *dfa, re_node_set *dest_nodes,
1777             const re_node_set *candidates)
1778             {
1779 19           reg_errcode_t err = REG_NOERROR;
1780             Idx i;
1781              
1782 19           re_dfastate_t *state = re_acquire_state (aTHX_ &err, dfa, dest_nodes);
1783 19 50         if (BE (err != REG_NOERROR, 0))
1784 0           return err;
1785              
1786 19 100         if (!state->inveclosure.alloc)
1787             {
1788 5           err = re_node_set_alloc (aTHX_ &state->inveclosure, dest_nodes->nelem);
1789 5 50         if (BE (err != REG_NOERROR, 0))
1790 0           return REG_ESPACE;
1791 10 100         for (i = 0; i < dest_nodes->nelem; i++)
1792             {
1793 5           err = re_node_set_merge (aTHX_ &state->inveclosure,
1794 5           dfa->inveclosures + dest_nodes->elems[i]);
1795 5 50         if (BE (err != REG_NOERROR, 0))
1796 0           return REG_ESPACE;
1797             }
1798             }
1799 19           return re_node_set_add_intersect (aTHX_ dest_nodes, candidates,
1800 19           &state->inveclosure);
1801             }
1802              
1803             static reg_errcode_t
1804             internal_function
1805 0           sub_epsilon_src_nodes (pTHX_ const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1806             const re_node_set *candidates)
1807             {
1808             Idx ecl_idx;
1809             reg_errcode_t err;
1810 0           re_node_set *inv_eclosure = dfa->inveclosures + node;
1811             re_node_set except_nodes;
1812 0           re_node_set_init_empty (&except_nodes);
1813 0 0         for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1814             {
1815 0           Idx cur_node = inv_eclosure->elems[ecl_idx];
1816 0 0         if (cur_node == node)
1817 0           continue;
1818 0 0         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1819             {
1820 0           Idx edst1 = dfa->edests[cur_node].elems[0];
1821 0           Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1822 0 0         ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1823 0 0         if ((!re_node_set_contains (aTHX_ inv_eclosure, edst1)
1824 0 0         && re_node_set_contains (aTHX_ dest_nodes, edst1))
1825 0 0         || (REG_VALID_NONZERO_INDEX (edst2)
1826 0 0         && !re_node_set_contains (aTHX_ inv_eclosure, edst2)
1827 0 0         && re_node_set_contains (aTHX_ dest_nodes, edst2)))
1828             {
1829 0           err = re_node_set_add_intersect (aTHX_ &except_nodes, candidates,
1830 0           dfa->inveclosures + cur_node);
1831 0 0         if (BE (err != REG_NOERROR, 0))
1832             {
1833 0 0         re_node_set_free (&except_nodes);
1834 0           return err;
1835             }
1836             }
1837             }
1838             }
1839 0 0         for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1840             {
1841 0           Idx cur_node = inv_eclosure->elems[ecl_idx];
1842 0 0         if (!re_node_set_contains (aTHX_ &except_nodes, cur_node))
1843             {
1844 0           Idx idx = re_node_set_contains (aTHX_ dest_nodes, cur_node) - 1;
1845 0           re_node_set_remove_at (aTHX_ dest_nodes, idx);
1846             }
1847             }
1848 0 0         re_node_set_free (&except_nodes);
1849 0           return REG_NOERROR;
1850             }
1851              
1852             static bool
1853             internal_function
1854 0           check_dst_limits (pTHX_ const re_match_context_t *mctx, const re_node_set *limits,
1855             Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1856             {
1857 0           const re_dfa_t *const dfa = mctx->dfa;
1858             Idx lim_idx, src_pos, dst_pos;
1859              
1860 0           Idx dst_bkref_idx = search_cur_bkref_entry (aTHX_ mctx, dst_idx);
1861 0           Idx src_bkref_idx = search_cur_bkref_entry (aTHX_ mctx, src_idx);
1862 0 0         for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1863             {
1864             Idx subexp_idx;
1865             struct re_backref_cache_entry *ent;
1866 0           ent = mctx->bkref_ents + limits->elems[lim_idx];
1867 0           subexp_idx = dfa->nodes[ent->node].opr.idx;
1868              
1869 0           dst_pos = check_dst_limits_calc_pos (aTHX_ mctx, limits->elems[lim_idx],
1870             subexp_idx, dst_node, dst_idx,
1871             dst_bkref_idx);
1872 0           src_pos = check_dst_limits_calc_pos (aTHX_ mctx, limits->elems[lim_idx],
1873             subexp_idx, src_node, src_idx,
1874             src_bkref_idx);
1875              
1876             /* In case of:
1877             ( )
1878             ( )
1879             ( ) */
1880 0 0         if (src_pos == dst_pos)
1881 0           continue; /* This is unrelated limitation. */
1882             else
1883 0           return true;
1884             }
1885 0           return false;
1886             }
1887              
1888             static int
1889             internal_function
1890 0           check_dst_limits_calc_pos_1 (pTHX_ const re_match_context_t *mctx, int boundaries,
1891             Idx subexp_idx, Idx from_node, Idx bkref_idx)
1892             {
1893 0           const re_dfa_t *const dfa = mctx->dfa;
1894 0           const re_node_set *eclosures = dfa->eclosures + from_node;
1895             Idx node_idx;
1896              
1897             /* Else, we are on the boundary: examine the nodes on the epsilon
1898             closure. */
1899 0 0         for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1900             {
1901 0           Idx node = eclosures->elems[node_idx];
1902 0           switch (dfa->nodes[node].type)
1903             {
1904             case OP_BACK_REF:
1905 0 0         if (bkref_idx != REG_MISSING)
1906             {
1907 0           struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1908             do
1909             {
1910             Idx dst;
1911             int cpos;
1912              
1913 0 0         if (ent->node != node)
1914 0           continue;
1915              
1916 0 0         if (subexp_idx < BITSET_WORD_BITS
1917 0 0         && !(ent->eps_reachable_subexps_map
1918             & ((bitset_word_t) 1 << subexp_idx)))
1919 0           continue;
1920              
1921             /* Recurse trying to reach the OP_OPEN_SUBEXP and
1922             OP_CLOSE_SUBEXP cases below. But, if the
1923             destination node is the same node as the source
1924             node, don't recurse because it would cause an
1925             infinite loop: a regex that exhibits this behavior
1926             is ()\1*\1* */
1927 0           dst = dfa->edests[node].elems[0];
1928 0 0         if (dst == from_node)
1929             {
1930 0 0         if (boundaries & 1)
1931 0           return -1;
1932             else /* if (boundaries & 2) */
1933 0           return 0;
1934             }
1935              
1936 0           cpos =
1937             check_dst_limits_calc_pos_1 (aTHX_ mctx, boundaries, subexp_idx,
1938             dst, bkref_idx);
1939 0 0         if (cpos == -1 /* && (boundaries & 1) */)
1940 0           return -1;
1941 0 0         if (cpos == 0 && (boundaries & 2))
    0          
1942 0           return 0;
1943              
1944 0 0         if (subexp_idx < BITSET_WORD_BITS)
1945             ent->eps_reachable_subexps_map
1946 0           &= ~((bitset_word_t) 1 << subexp_idx);
1947             }
1948 0 0         while (ent++->more);
1949             }
1950 0           break;
1951              
1952             case OP_OPEN_SUBEXP:
1953 0 0         if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
    0          
1954 0           return -1;
1955 0           break;
1956              
1957             case OP_CLOSE_SUBEXP:
1958 0 0         if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
    0          
1959 0           return 0;
1960 0           break;
1961              
1962             default:
1963 0           break;
1964             }
1965             }
1966              
1967 0           return (boundaries & 2) ? 1 : 0;
1968             }
1969              
1970             static int
1971             internal_function
1972 0           check_dst_limits_calc_pos (pTHX_ const re_match_context_t *mctx, Idx limit,
1973             Idx subexp_idx, Idx from_node, Idx str_idx,
1974             Idx bkref_idx)
1975             {
1976 0           struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1977             int boundaries;
1978              
1979             /* If we are outside the range of the subexpression, return -1 or 1. */
1980 0 0         if (str_idx < lim->subexp_from)
1981 0           return -1;
1982              
1983 0 0         if (lim->subexp_to < str_idx)
1984 0           return 1;
1985              
1986             /* If we are within the subexpression, return 0. */
1987 0           boundaries = (str_idx == lim->subexp_from);
1988 0 0         boundaries |= (str_idx == lim->subexp_to) << 1;
1989 0 0         if (boundaries == 0)
1990 0           return 0;
1991              
1992             /* Else, examine epsilon closure. */
1993 0           return check_dst_limits_calc_pos_1 (aTHX_ mctx, boundaries, subexp_idx,
1994             from_node, bkref_idx);
1995             }
1996              
1997             /* Check the limitations of sub expressions LIMITS, and remove the nodes
1998             which are against limitations from DEST_NODES. */
1999              
2000             static reg_errcode_t
2001             internal_function
2002 0           check_subexp_limits (pTHX_ const re_dfa_t *dfa, re_node_set *dest_nodes,
2003             const re_node_set *candidates, re_node_set *limits,
2004             struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2005             {
2006             reg_errcode_t err;
2007             Idx node_idx, lim_idx;
2008              
2009 0 0         for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2010             {
2011             Idx subexp_idx;
2012             struct re_backref_cache_entry *ent;
2013 0           ent = bkref_ents + limits->elems[lim_idx];
2014              
2015 0 0         if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
    0          
2016 0           continue; /* This is unrelated limitation. */
2017              
2018 0           subexp_idx = dfa->nodes[ent->node].opr.idx;
2019 0 0         if (ent->subexp_to == str_idx)
2020             {
2021 0           Idx ops_node = REG_MISSING;
2022 0           Idx cls_node = REG_MISSING;
2023 0 0         for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2024             {
2025 0           Idx node = dest_nodes->elems[node_idx];
2026 0           re_token_type_t type = dfa->nodes[node].type;
2027 0 0         if (type == OP_OPEN_SUBEXP
2028 0 0         && subexp_idx == dfa->nodes[node].opr.idx)
2029 0           ops_node = node;
2030 0 0         else if (type == OP_CLOSE_SUBEXP
2031 0 0         && subexp_idx == dfa->nodes[node].opr.idx)
2032 0           cls_node = node;
2033             }
2034              
2035             /* Check the limitation of the open subexpression. */
2036             /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2037 0 0         if (REG_VALID_INDEX (ops_node))
2038             {
2039 0           err = sub_epsilon_src_nodes (aTHX_ dfa, ops_node, dest_nodes,
2040             candidates);
2041 0 0         if (BE (err != REG_NOERROR, 0))
2042 0           return err;
2043             }
2044              
2045             /* Check the limitation of the close subexpression. */
2046 0 0         if (REG_VALID_INDEX (cls_node))
2047 0 0         for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2048             {
2049 0           Idx node = dest_nodes->elems[node_idx];
2050 0 0         if (!re_node_set_contains (aTHX_ dfa->inveclosures + node,
2051             cls_node)
2052 0 0         && !re_node_set_contains (aTHX_ dfa->eclosures + node,
2053             cls_node))
2054             {
2055             /* It is against this limitation.
2056             Remove it form the current sifted state. */
2057 0           err = sub_epsilon_src_nodes (aTHX_ dfa, node, dest_nodes,
2058             candidates);
2059 0 0         if (BE (err != REG_NOERROR, 0))
2060 0           return err;
2061 0           --node_idx;
2062             }
2063             }
2064             }
2065             else /* (ent->subexp_to != str_idx) */
2066             {
2067 0 0         for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2068             {
2069 0           Idx node = dest_nodes->elems[node_idx];
2070 0           re_token_type_t type = dfa->nodes[node].type;
2071 0 0         if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
    0          
2072             {
2073 0 0         if (subexp_idx != dfa->nodes[node].opr.idx)
2074 0           continue;
2075             /* It is against this limitation.
2076             Remove it form the current sifted state. */
2077 0           err = sub_epsilon_src_nodes (aTHX_ dfa, node, dest_nodes,
2078             candidates);
2079 0 0         if (BE (err != REG_NOERROR, 0))
2080 0           return err;
2081             }
2082             }
2083             }
2084             }
2085 0           return REG_NOERROR;
2086             }
2087              
2088             static reg_errcode_t
2089             internal_function __attribute_warn_unused_result__
2090 0           sift_states_bkref (pTHX_ const re_match_context_t *mctx, re_sift_context_t *sctx,
2091             Idx str_idx, const re_node_set *candidates)
2092             {
2093 0           const re_dfa_t *const dfa = mctx->dfa;
2094             reg_errcode_t err;
2095             Idx node_idx, node;
2096             re_sift_context_t local_sctx;
2097 0           Idx first_idx = search_cur_bkref_entry (aTHX_ mctx, str_idx);
2098              
2099 0 0         if (first_idx == REG_MISSING)
2100 0           return REG_NOERROR;
2101              
2102 0           local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2103              
2104 0 0         for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2105             {
2106             Idx enabled_idx;
2107             re_token_type_t type;
2108             struct re_backref_cache_entry *entry;
2109 0           node = candidates->elems[node_idx];
2110 0           type = dfa->nodes[node].type;
2111             /* Avoid infinite loop for the REs like "()\1+". */
2112 0 0         if (node == sctx->last_node && str_idx == sctx->last_str_idx)
    0          
2113 0           continue;
2114 0 0         if (type != OP_BACK_REF)
2115 0           continue;
2116              
2117 0           entry = mctx->bkref_ents + first_idx;
2118 0           enabled_idx = first_idx;
2119             do
2120             {
2121             Idx subexp_len;
2122             Idx to_idx;
2123             Idx dst_node;
2124             bool ok;
2125             re_dfastate_t *cur_state;
2126              
2127 0 0         if (entry->node != node)
2128 0           continue;
2129 0           subexp_len = entry->subexp_to - entry->subexp_from;
2130 0           to_idx = str_idx + subexp_len;
2131 0           dst_node = (subexp_len ? dfa->nexts[node]
2132 0 0         : dfa->edests[node].elems[0]);
2133              
2134 0 0         if (to_idx > sctx->last_str_idx
2135 0 0         || sctx->sifted_states[to_idx] == NULL
2136 0 0         || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
    0          
2137 0 0         || check_dst_limits (aTHX_ mctx, &sctx->limits, node,
2138             str_idx, dst_node, to_idx))
2139 0           continue;
2140              
2141 0 0         if (local_sctx.sifted_states == NULL)
2142             {
2143 0           local_sctx = *sctx;
2144 0           err = re_node_set_init_copy (aTHX_ &local_sctx.limits, &sctx->limits);
2145 0 0         if (BE (err != REG_NOERROR, 0))
2146 0           goto free_return;
2147             }
2148 0           local_sctx.last_node = node;
2149 0           local_sctx.last_str_idx = str_idx;
2150 0           ok = re_node_set_insert (aTHX_ &local_sctx.limits, enabled_idx);
2151 0 0         if (BE (! ok, 0))
2152             {
2153 0           err = REG_ESPACE;
2154 0           goto free_return;
2155             }
2156 0           cur_state = local_sctx.sifted_states[str_idx];
2157 0           err = sift_states_backward (aTHX_ mctx, &local_sctx);
2158 0 0         if (BE (err != REG_NOERROR, 0))
2159 0           goto free_return;
2160 0 0         if (sctx->limited_states != NULL)
2161             {
2162 0           err = merge_state_array (aTHX_ dfa, sctx->limited_states,
2163             local_sctx.sifted_states,
2164             str_idx + 1);
2165 0 0         if (BE (err != REG_NOERROR, 0))
2166 0           goto free_return;
2167             }
2168 0           local_sctx.sifted_states[str_idx] = cur_state;
2169 0           re_node_set_remove (aTHX_ &local_sctx.limits, enabled_idx);
2170              
2171             /* mctx->bkref_ents may have changed, reload the pointer. */
2172 0           entry = mctx->bkref_ents + enabled_idx;
2173             }
2174 0 0         while (enabled_idx++, entry++->more);
2175             }
2176 0           err = REG_NOERROR;
2177             free_return:
2178 0 0         if (local_sctx.sifted_states != NULL)
2179             {
2180 0 0         re_node_set_free (&local_sctx.limits);
2181             }
2182              
2183 0           return err;
2184             }
2185              
2186              
2187             #ifdef RE_ENABLE_I18N
2188             static int
2189             internal_function
2190 9           sift_states_iter_mb (pTHX_ const re_match_context_t *mctx, re_sift_context_t *sctx,
2191             Idx node_idx, Idx str_idx, Idx max_str_idx)
2192             {
2193 9           const re_dfa_t *const dfa = mctx->dfa;
2194             int naccepted;
2195             /* Check the node can accept "multi byte". */
2196 9           naccepted = check_node_accept_bytes (aTHX_ dfa, node_idx, &mctx->input, mctx->sv, str_idx);
2197 9 50         if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
    50          
    50          
2198 9 50         !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2199             dfa->nexts[node_idx]))
2200             /* The node can't accept the "multi byte", or the
2201             destination was already thrown away, then the node
2202             could't accept the current input "multi byte". */
2203 0           naccepted = 0;
2204             /* Otherwise, it is sure that the node could accept
2205             'naccepted' bytes input. */
2206 9           return naccepted;
2207             }
2208             #endif /* RE_ENABLE_I18N */
2209              
2210            
2211             /* Functions for state transition. */
2212              
2213             /* Return the next state to which the current state STATE will transit by
2214             accepting the current input byte, and update STATE_LOG if necessary.
2215             If STATE can accept a multibyte char/collating element/back reference
2216             update the destination of STATE_LOG. */
2217              
2218             static re_dfastate_t *
2219             internal_function __attribute_warn_unused_result__
2220 61           transit_state (pTHX_ reg_errcode_t *err, re_match_context_t *mctx,
2221             re_dfastate_t *state)
2222             {
2223             re_dfastate_t **trtable;
2224             unsigned char ch;
2225              
2226             #ifdef RE_ENABLE_I18N
2227             /* If the current state can accept multibyte. */
2228 61 100         if (BE (state->accept_mb, 0))
2229             {
2230 17           *err = transit_state_mb (aTHX_ mctx, state);
2231 17 50         if (BE (*err != REG_NOERROR, 0))
2232 0           return NULL;
2233             }
2234             #endif /* RE_ENABLE_I18N */
2235              
2236             /* Then decide the next state with the single byte. */
2237             #if 0
2238             if (0)
2239             /* don't use transition table */
2240             return transit_state_sb (err, mctx, state);
2241             #endif
2242              
2243             /* Use transition table */
2244 61           ch = re_string_fetch_byte (&mctx->input);
2245             for (;;)
2246             {
2247 80           trtable = state->trtable;
2248 80 100         if (BE (trtable != NULL, 1))
2249 61           return trtable[ch];
2250              
2251 19           trtable = state->word_trtable;
2252 19 50         if (BE (trtable != NULL, 1))
2253             {
2254             unsigned int context;
2255             context
2256 0           = re_string_context_at (aTHX_ &mctx->input,
2257 0           re_string_cur_idx (&mctx->input) - 1,
2258             mctx->eflags);
2259 0 0         if (IS_WORD_CONTEXT (context))
2260 0           return trtable[ch + SBC_MAX];
2261             else
2262 0           return trtable[ch];
2263             }
2264              
2265 19 50         if (!build_trtable (aTHX_ mctx->dfa, state))
2266             {
2267 0           *err = REG_ESPACE;
2268 0           return NULL;
2269             }
2270              
2271             /* Retry, we now have a transition table. */
2272 19           }
2273             }
2274              
2275             /* Update the state_log if we need */
2276             static re_dfastate_t *
2277             internal_function
2278 62           merge_state_with_log (pTHX_ reg_errcode_t *err, re_match_context_t *mctx,
2279             re_dfastate_t *next_state)
2280             {
2281 62           const re_dfa_t *const dfa = mctx->dfa;
2282 62           Idx cur_idx = re_string_cur_idx (&mctx->input);
2283              
2284 62 100         if (cur_idx > mctx->state_log_top)
2285             {
2286 30           mctx->state_log[cur_idx] = next_state;
2287 30           mctx->state_log_top = cur_idx;
2288             }
2289 32 100         else if (mctx->state_log[cur_idx] == 0)
2290             {
2291 16           mctx->state_log[cur_idx] = next_state;
2292             }
2293             else
2294             {
2295             re_dfastate_t *pstate;
2296             unsigned int context;
2297 16           re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2298             /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2299             the destination of a multibyte char/collating element/
2300             back reference. Then the next state is the union set of
2301             these destinations and the results of the transition table. */
2302 16           pstate = mctx->state_log[cur_idx];
2303 16           log_nodes = pstate->entrance_nodes;
2304 16 50         if (next_state != NULL)
2305             {
2306 0           table_nodes = next_state->entrance_nodes;
2307 0           *err = re_node_set_init_union (aTHX_ &next_nodes, table_nodes,
2308             log_nodes);
2309 0 0         if (BE (*err != REG_NOERROR, 0))
2310 0           return NULL;
2311             }
2312             else
2313 16           next_nodes = *log_nodes;
2314             /* Note: We already add the nodes of the initial state,
2315             then we don't need to add them here. */
2316              
2317 16           context = re_string_context_at (aTHX_ &mctx->input,
2318 16           re_string_cur_idx (&mctx->input) - 1,
2319             mctx->eflags);
2320 32           next_state = mctx->state_log[cur_idx]
2321 16           = re_acquire_state_context (aTHX_ err, dfa, &next_nodes, context);
2322             /* We don't need to check errors here, since the return value of
2323             this function is next_state and ERR is already set. */
2324              
2325 16 50         if (table_nodes != NULL)
2326 16 0         re_node_set_free (&next_nodes);
2327             }
2328              
2329 62 50         if (BE (dfa->nbackref, 0) && next_state != NULL)
    0          
2330             {
2331             /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2332             later. We must check them here, since the back references in the
2333             next state might use them. */
2334 0           *err = check_subexp_matching_top (aTHX_ mctx, &next_state->nodes,
2335             cur_idx);
2336 0 0         if (BE (*err != REG_NOERROR, 0))
2337 0           return NULL;
2338              
2339             /* If the next state has back references. */
2340 0 0         if (next_state->has_backref)
2341             {
2342 0           *err = transit_state_bkref (aTHX_ mctx, &next_state->nodes);
2343 0 0         if (BE (*err != REG_NOERROR, 0))
2344 0           return NULL;
2345 0           next_state = mctx->state_log[cur_idx];
2346             }
2347             }
2348              
2349 62           return next_state;
2350             }
2351              
2352             /* Skip bytes in the input that correspond to part of a
2353             multi-byte match, then look in the log for a state
2354             from which to restart matching. */
2355             static re_dfastate_t *
2356             internal_function
2357 33           find_recover_state (pTHX_ reg_errcode_t *err, re_match_context_t *mctx)
2358             {
2359             re_dfastate_t *cur_state;
2360             do
2361             {
2362 33           Idx max = mctx->state_log_top;
2363 33           Idx cur_str_idx = re_string_cur_idx (aTHX_ &mctx->input);
2364              
2365             do
2366             {
2367 49 100         if (++cur_str_idx > max)
2368 17           return NULL;
2369 32           re_string_skip_bytes (aTHX_ &mctx->input, 1);
2370             }
2371 32 100         while (mctx->state_log[cur_str_idx] == NULL);
2372              
2373 16           cur_state = merge_state_with_log (aTHX_ err, mctx, NULL);
2374             }
2375 16 50         while (*err == REG_NOERROR && cur_state == NULL);
    50          
2376 16           return cur_state;
2377             }
2378              
2379             /* Helper functions for transit_state. */
2380              
2381             /* From the node set CUR_NODES, pick up the nodes whose types are
2382             OP_OPEN_SUBEXP and which have corresponding back references in the regular
2383             expression. And register them to use them later for evaluating the
2384             corresponding back references. */
2385              
2386             static reg_errcode_t
2387             internal_function
2388 0           check_subexp_matching_top (pTHX_ re_match_context_t *mctx, re_node_set *cur_nodes,
2389             Idx str_idx)
2390             {
2391 0           const re_dfa_t *const dfa = mctx->dfa;
2392             Idx node_idx;
2393             reg_errcode_t err;
2394              
2395             /* TODO: This isn't efficient.
2396             Because there might be more than one nodes whose types are
2397             OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2398             nodes.
2399             E.g. RE: (a){2} */
2400 0 0         for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2401             {
2402 0           Idx node = cur_nodes->elems[node_idx];
2403 0 0         if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2404 0 0         && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2405 0 0         && (dfa->used_bkref_map
2406 0           & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2407             {
2408 0           err = match_ctx_add_subtop (aTHX_ mctx, node, str_idx);
2409 0 0         if (BE (err != REG_NOERROR, 0))
2410 0           return err;
2411             }
2412             }
2413 0           return REG_NOERROR;
2414             }
2415              
2416             #if 0
2417             /* Return the next state to which the current state STATE will transit by
2418             accepting the current input byte. */
2419              
2420             static re_dfastate_t *
2421             transit_state_sb (pTHX_ reg_errcode_t *err, re_match_context_t *mctx,
2422             re_dfastate_t *state)
2423             {
2424             const re_dfa_t *const dfa = mctx->dfa;
2425             re_node_set next_nodes;
2426             re_dfastate_t *next_state;
2427             Idx node_cnt, cur_str_idx = re_string_cur_idx (aTHX_ &mctx->input);
2428             unsigned int context;
2429              
2430             *err = re_node_set_alloc (&aTHX_ next_nodes, state->nodes.nelem + 1);
2431             if (BE (*err != REG_NOERROR, 0))
2432             return NULL;
2433             for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2434             {
2435             Idx cur_node = state->nodes.elems[node_cnt];
2436             if (check_node_accept (aTHX_ mctx, dfa->nodes + cur_node, cur_str_idx))
2437             {
2438             *err = re_node_set_merge (aTHX_ &next_nodes,
2439             dfa->eclosures + dfa->nexts[cur_node]);
2440             if (BE (*err != REG_NOERROR, 0))
2441             {
2442             re_node_set_free (&next_nodes);
2443             return NULL;
2444             }
2445             }
2446             }
2447             context = re_string_context_at (aTHX_ &mctx->input, cur_str_idx, mctx->eflags);
2448             next_state = re_acquire_state_context (aTHX_ err, dfa, &next_nodes, context);
2449             /* We don't need to check errors here, since the return value of
2450             this function is next_state and ERR is already set. */
2451              
2452             re_node_set_free (&next_nodes);
2453             re_string_skip_bytes (aTHX_ &mctx->input, 1);
2454             return next_state;
2455             }
2456             #endif
2457              
2458             #ifdef RE_ENABLE_I18N
2459             static reg_errcode_t
2460             internal_function
2461 17           transit_state_mb (pTHX_ re_match_context_t *mctx, re_dfastate_t *pstate)
2462             {
2463 17           const re_dfa_t *const dfa = mctx->dfa;
2464             reg_errcode_t err;
2465             Idx i;
2466              
2467 85 100         for (i = 0; i < pstate->nodes.nelem; ++i)
2468             {
2469             re_node_set dest_nodes, *new_nodes;
2470 68           Idx cur_node_idx = pstate->nodes.elems[i];
2471             int naccepted;
2472             Idx dest_idx;
2473             unsigned int context;
2474             re_dfastate_t *dest_state;
2475              
2476 68 100         if (!dfa->nodes[cur_node_idx].accept_mb)
2477 52           continue;
2478              
2479 17 50         if (dfa->nodes[cur_node_idx].constraint)
2480             {
2481 0           context = re_string_context_at (aTHX_ &mctx->input,
2482             re_string_cur_idx (&mctx->input),
2483             mctx->eflags);
2484 0 0         if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
    0          
    0          
    0          
    0          
    0          
    0          
    0          
2485             context))
2486 0           continue;
2487             }
2488              
2489             /* How many bytes the node can accept? */
2490 17           naccepted = check_node_accept_bytes (aTHX_ dfa, cur_node_idx, &mctx->input, mctx->sv,
2491             re_string_cur_idx (&mctx->input));
2492 17 100         if (naccepted == 0)
2493 1           continue;
2494              
2495             /* The node can accepts 'naccepted' bytes. */
2496 16           dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2497 16           mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2498 16           : mctx->max_mb_elem_len);
2499 16           err = clean_state_log_if_needed (aTHX_ mctx, dest_idx);
2500 16 50         if (BE (err != REG_NOERROR, 0))
2501 0           return err;
2502             #ifdef DEBUG
2503             assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2504             #endif
2505 16           new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2506              
2507 16           dest_state = mctx->state_log[dest_idx];
2508 16 50         if (dest_state == NULL)
2509 16           dest_nodes = *new_nodes;
2510             else
2511             {
2512 0           err = re_node_set_init_union (aTHX_ &dest_nodes,
2513 0           dest_state->entrance_nodes, new_nodes);
2514 0 0         if (BE (err != REG_NOERROR, 0))
2515 0           return err;
2516             }
2517 16           context = re_string_context_at (aTHX_ &mctx->input, dest_idx - 1,
2518             mctx->eflags);
2519 16           mctx->state_log[dest_idx]
2520 16           = re_acquire_state_context (aTHX_ &err, dfa, &dest_nodes, context);
2521 16 50         if (dest_state != NULL)
2522 0 0         re_node_set_free (&dest_nodes);
2523 16 50         if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
    0          
2524 16           return err;
2525             }
2526 17           return REG_NOERROR;
2527             }
2528             #endif /* RE_ENABLE_I18N */
2529              
2530             static reg_errcode_t
2531             internal_function
2532 0           transit_state_bkref (pTHX_ re_match_context_t *mctx, const re_node_set *nodes)
2533             {
2534 0           const re_dfa_t *const dfa = mctx->dfa;
2535             reg_errcode_t err;
2536             Idx i;
2537 0           Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2538              
2539 0 0         for (i = 0; i < nodes->nelem; ++i)
2540             {
2541             Idx dest_str_idx, prev_nelem, bkc_idx;
2542 0           Idx node_idx = nodes->elems[i];
2543             unsigned int context;
2544 0           const re_token_t *node = dfa->nodes + node_idx;
2545             re_node_set *new_dest_nodes;
2546              
2547             /* Check whether 'node' is a backreference or not. */
2548 0 0         if (node->type != OP_BACK_REF)
2549 0           continue;
2550              
2551 0 0         if (node->constraint)
2552             {
2553 0           context = re_string_context_at (aTHX_ &mctx->input, cur_str_idx,
2554             mctx->eflags);
2555 0 0         if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
    0          
    0          
    0          
    0          
    0          
    0          
    0          
2556 0           continue;
2557             }
2558              
2559             /* 'node' is a backreference.
2560             Check the substring which the substring matched. */
2561 0           bkc_idx = mctx->nbkref_ents;
2562 0           err = get_subexp (aTHX_ mctx, node_idx, cur_str_idx);
2563 0 0         if (BE (err != REG_NOERROR, 0))
2564 0           goto free_return;
2565              
2566             /* And add the epsilon closures (which is 'new_dest_nodes') of
2567             the backreference to appropriate state_log. */
2568             #ifdef DEBUG
2569             assert (dfa->nexts[node_idx] != REG_MISSING);
2570             #endif
2571 0 0         for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2572             {
2573             Idx subexp_len;
2574             re_dfastate_t *dest_state;
2575             struct re_backref_cache_entry *bkref_ent;
2576 0           bkref_ent = mctx->bkref_ents + bkc_idx;
2577 0 0         if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
    0          
2578 0           continue;
2579 0           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2580 0           new_dest_nodes = (subexp_len == 0
2581 0           ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2582 0 0         : dfa->eclosures + dfa->nexts[node_idx]);
2583 0           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2584 0           - bkref_ent->subexp_from);
2585 0           context = re_string_context_at (aTHX_ &mctx->input, dest_str_idx - 1,
2586             mctx->eflags);
2587 0           dest_state = mctx->state_log[dest_str_idx];
2588 0           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2589 0 0         : mctx->state_log[cur_str_idx]->nodes.nelem);
2590             /* Add 'new_dest_node' to state_log. */
2591 0 0         if (dest_state == NULL)
2592             {
2593 0           mctx->state_log[dest_str_idx]
2594 0           = re_acquire_state_context (aTHX_ &err, dfa, new_dest_nodes,
2595             context);
2596 0 0         if (BE (mctx->state_log[dest_str_idx] == NULL
    0          
2597             && err != REG_NOERROR, 0))
2598 0           goto free_return;
2599             }
2600             else
2601             {
2602             re_node_set dest_nodes;
2603 0           err = re_node_set_init_union (aTHX_ &dest_nodes,
2604 0           dest_state->entrance_nodes,
2605             new_dest_nodes);
2606 0 0         if (BE (err != REG_NOERROR, 0))
2607             {
2608 0 0         re_node_set_free (&dest_nodes);
2609 0           goto free_return;
2610             }
2611 0           mctx->state_log[dest_str_idx]
2612 0           = re_acquire_state_context (aTHX_ &err, dfa, &dest_nodes, context);
2613 0 0         re_node_set_free (&dest_nodes);
2614 0 0         if (BE (mctx->state_log[dest_str_idx] == NULL
    0          
2615             && err != REG_NOERROR, 0))
2616 0           goto free_return;
2617             }
2618             /* We need to check recursively if the backreference can epsilon
2619             transit. */
2620 0 0         if (subexp_len == 0
2621 0 0         && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2622             {
2623 0           err = check_subexp_matching_top (aTHX_ mctx, new_dest_nodes,
2624             cur_str_idx);
2625 0 0         if (BE (err != REG_NOERROR, 0))
2626 0           goto free_return;
2627 0           err = transit_state_bkref (aTHX_ mctx, new_dest_nodes);
2628 0 0         if (BE (err != REG_NOERROR, 0))
2629 0           goto free_return;
2630             }
2631             }
2632             }
2633 0           err = REG_NOERROR;
2634             free_return:
2635 0           return err;
2636             }
2637              
2638             /* Enumerate all the candidates which the backreference BKREF_NODE can match
2639             at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2640             Note that we might collect inappropriate candidates here.
2641             However, the cost of checking them strictly here is too high, then we
2642             delay these checking for prune_impossible_nodes(). */
2643              
2644             static reg_errcode_t
2645             internal_function __attribute_warn_unused_result__
2646 0           get_subexp (pTHX_ re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2647             {
2648 0           const re_dfa_t *const dfa = mctx->dfa;
2649             Idx subexp_num, sub_top_idx;
2650 0           const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2651             /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2652 0           Idx cache_idx = search_cur_bkref_entry (aTHX_ mctx, bkref_str_idx);
2653 0 0         if (cache_idx != REG_MISSING)
2654             {
2655 0           const struct re_backref_cache_entry *entry
2656 0           = mctx->bkref_ents + cache_idx;
2657             do
2658 0 0         if (entry->node == bkref_node)
2659 0           return REG_NOERROR; /* We already checked it. */
2660 0 0         while (entry++->more);
2661             }
2662              
2663 0           subexp_num = dfa->nodes[bkref_node].opr.idx;
2664              
2665             /* For each sub expression */
2666 0 0         for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2667             {
2668             reg_errcode_t err;
2669 0           re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2670             re_sub_match_last_t *sub_last;
2671             Idx sub_last_idx, sl_str, bkref_str_off;
2672              
2673 0 0         if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2674 0           continue; /* It isn't related. */
2675              
2676 0           sl_str = sub_top->str_idx;
2677 0           bkref_str_off = bkref_str_idx;
2678             /* At first, check the last node of sub expressions we already
2679             evaluated. */
2680 0 0         for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2681             {
2682             regoff_t sl_str_diff;
2683 0           sub_last = sub_top->lasts[sub_last_idx];
2684 0           sl_str_diff = sub_last->str_idx - sl_str;
2685             /* The matched string by the sub expression match with the substring
2686             at the back reference? */
2687 0 0         if (sl_str_diff > 0)
2688             {
2689 0 0         if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2690             {
2691             /* Not enough chars for a successful match. */
2692 0 0         if (bkref_str_off + sl_str_diff > mctx->input.len)
2693 0           break;
2694              
2695 0           err = clean_state_log_if_needed (aTHX_ mctx,
2696             bkref_str_off
2697             + sl_str_diff);
2698 0 0         if (BE (err != REG_NOERROR, 0))
2699 0           return err;
2700 0           buf = (const char *) re_string_get_buffer (&mctx->input);
2701             }
2702 0 0         if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2703             /* We don't need to search this sub expression any more. */
2704 0           break;
2705             }
2706 0           bkref_str_off += sl_str_diff;
2707 0           sl_str += sl_str_diff;
2708 0           err = get_subexp_sub (aTHX_ mctx, sub_top, sub_last, bkref_node,
2709             bkref_str_idx);
2710              
2711             /* Reload buf, since the preceding call might have reallocated
2712             the buffer. */
2713 0           buf = (const char *) re_string_get_buffer (&mctx->input);
2714              
2715 0 0         if (err == REG_NOMATCH)
2716 0           continue;
2717 0 0         if (BE (err != REG_NOERROR, 0))
2718 0           return err;
2719             }
2720              
2721 0 0         if (sub_last_idx < sub_top->nlasts)
2722 0           continue;
2723 0 0         if (sub_last_idx > 0)
2724 0           ++sl_str;
2725             /* Then, search for the other last nodes of the sub expression. */
2726 0 0         for (; sl_str <= bkref_str_idx; ++sl_str)
2727             {
2728             Idx cls_node;
2729             regoff_t sl_str_off;
2730             const re_node_set *nodes;
2731 0           sl_str_off = sl_str - sub_top->str_idx;
2732             /* The matched string by the sub expression match with the substring
2733             at the back reference? */
2734 0 0         if (sl_str_off > 0)
2735             {
2736 0 0         if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2737             {
2738             /* If we are at the end of the input, we cannot match. */
2739 0 0         if (bkref_str_off >= mctx->input.len)
2740 0           break;
2741              
2742 0           err = extend_buffers (aTHX_ mctx, bkref_str_off + 1);
2743 0 0         if (BE (err != REG_NOERROR, 0))
2744 0           return err;
2745              
2746 0           buf = (const char *) re_string_get_buffer (aTHX_ &mctx->input);
2747             }
2748 0 0         if (buf [bkref_str_off++] != buf[sl_str - 1])
2749 0           break; /* We don't need to search this sub expression
2750             any more. */
2751             }
2752 0 0         if (mctx->state_log[sl_str] == NULL)
2753 0           continue;
2754             /* Does this state have a ')' of the sub expression? */
2755 0           nodes = &mctx->state_log[sl_str]->nodes;
2756 0           cls_node = find_subexp_node (aTHX_ dfa, nodes, subexp_num,
2757             OP_CLOSE_SUBEXP);
2758 0 0         if (cls_node == REG_MISSING)
2759 0           continue; /* No. */
2760 0 0         if (sub_top->path == NULL)
2761             {
2762 0 0         re_calloc(sub_top->path, state_array_t, sl_str - sub_top->str_idx + 1);
2763             }
2764             /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2765             in the current context? */
2766 0           err = check_arrival (aTHX_ mctx, sub_top->path, sub_top->node,
2767             sub_top->str_idx, cls_node, sl_str,
2768             OP_CLOSE_SUBEXP);
2769 0 0         if (err == REG_NOMATCH)
2770 0           continue;
2771 0 0         if (BE (err != REG_NOERROR, 0))
2772 0           return err;
2773 0           sub_last = match_ctx_add_sublast (aTHX_ sub_top, cls_node, sl_str);
2774 0 0         if (BE (sub_last == NULL, 0))
2775 0           return REG_ESPACE;
2776 0           err = get_subexp_sub (aTHX_ mctx, sub_top, sub_last, bkref_node,
2777             bkref_str_idx);
2778 0 0         if (err == REG_NOMATCH)
2779 0           continue;
2780             }
2781             }
2782 0           return REG_NOERROR;
2783             }
2784              
2785             /* Helper functions for get_subexp(). */
2786              
2787             /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2788             If it can arrive, register the sub expression expressed with SUB_TOP
2789             and SUB_LAST. */
2790              
2791             static reg_errcode_t
2792             internal_function
2793 0           get_subexp_sub (pTHX_ re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2794             re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2795             {
2796             reg_errcode_t err;
2797             Idx to_idx;
2798             /* Can the subexpression arrive the back reference? */
2799 0           err = check_arrival (aTHX_ mctx, &sub_last->path, sub_last->node,
2800             sub_last->str_idx, bkref_node, bkref_str,
2801             OP_OPEN_SUBEXP);
2802 0 0         if (err != REG_NOERROR)
2803 0           return err;
2804 0           err = match_ctx_add_entry (aTHX_ mctx, bkref_node, bkref_str, sub_top->str_idx,
2805             sub_last->str_idx);
2806 0 0         if (BE (err != REG_NOERROR, 0))
2807 0           return err;
2808 0           to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2809 0           return clean_state_log_if_needed (aTHX_ mctx, to_idx);
2810             }
2811              
2812             /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2813             Search '(' if FL_OPEN, or search ')' otherwise.
2814             TODO: This function isn't efficient...
2815             Because there might be more than one nodes whose types are
2816             OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2817             nodes.
2818             E.g. RE: (a){2} */
2819              
2820             static Idx
2821             internal_function
2822 0           find_subexp_node (pTHX_ const re_dfa_t *dfa, const re_node_set *nodes,
2823             Idx subexp_idx, int type)
2824             {
2825             Idx cls_idx;
2826 0 0         for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2827             {
2828 0           Idx cls_node = nodes->elems[cls_idx];
2829 0           const re_token_t *node = dfa->nodes + cls_node;
2830 0 0         if (node->type == type
2831 0 0         && node->opr.idx == subexp_idx)
2832 0           return cls_node;
2833             }
2834 0           return REG_MISSING;
2835             }
2836              
2837             /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2838             LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2839             heavily reused.
2840             Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2841              
2842             static reg_errcode_t
2843             internal_function __attribute_warn_unused_result__
2844 0           check_arrival (pTHX_ re_match_context_t *mctx, state_array_t *path, Idx top_node,
2845             Idx top_str, Idx last_node, Idx last_str, int type)
2846             {
2847 0           const re_dfa_t *const dfa = mctx->dfa;
2848 0           reg_errcode_t err = REG_NOERROR;
2849             Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2850 0           re_dfastate_t *cur_state = NULL;
2851             re_node_set *cur_nodes, next_nodes;
2852             re_dfastate_t **backup_state_log;
2853             unsigned int context;
2854              
2855 0           subexp_num = dfa->nodes[top_node].opr.idx;
2856             /* Extend the buffer if we need. */
2857 0 0         if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2858             {
2859 0           Idx old_alloc = path->alloc;
2860 0           Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2861             Idx new_alloc;
2862 0 0         if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2863 0           return REG_ESPACE;
2864 0           new_alloc = old_alloc + incr_alloc;
2865 0 0         if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2866 0           return REG_ESPACE;
2867 0 0         re_realloc (path->array, re_dfastate_t *, new_alloc);
2868 0           path->alloc = new_alloc;
2869 0           memset (path->array + old_alloc, '\0',
2870 0           sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2871             }
2872              
2873 0 0         str_idx = path->next_idx ? path->next_idx : top_str;
2874              
2875             /* Temporary modify MCTX. */
2876 0           backup_state_log = mctx->state_log;
2877 0           backup_cur_idx = mctx->input.cur_idx;
2878 0           mctx->state_log = path->array;
2879 0           mctx->input.cur_idx = str_idx;
2880              
2881             /* Setup initial node set. */
2882 0           context = re_string_context_at (aTHX_ &mctx->input, str_idx - 1, mctx->eflags);
2883 0 0         if (str_idx == top_str)
2884             {
2885 0           err = re_node_set_init_1 (aTHX_ &next_nodes, top_node);
2886 0 0         if (BE (err != REG_NOERROR, 0))
2887 0           return err;
2888 0           err = check_arrival_expand_ecl (aTHX_ dfa, &next_nodes, subexp_num, type);
2889 0 0         if (BE (err != REG_NOERROR, 0))
2890             {
2891 0 0         re_node_set_free (&next_nodes);
2892 0           return err;
2893             }
2894             }
2895             else
2896             {
2897 0           cur_state = mctx->state_log[str_idx];
2898 0 0         if (cur_state && cur_state->has_backref)
    0          
2899             {
2900 0           err = re_node_set_init_copy (aTHX_ &next_nodes, &cur_state->nodes);
2901 0 0         if (BE (err != REG_NOERROR, 0))
2902 0           return err;
2903             }
2904             else
2905 0           re_node_set_init_empty (&next_nodes);
2906             }
2907 0 0         if (str_idx == top_str || (cur_state && cur_state->has_backref))
    0          
    0          
2908             {
2909 0 0         if (next_nodes.nelem)
2910             {
2911 0           err = expand_bkref_cache (aTHX_ mctx, &next_nodes, str_idx,
2912             subexp_num, type);
2913 0 0         if (BE (err != REG_NOERROR, 0))
2914             {
2915 0 0         re_node_set_free (&next_nodes);
2916 0           return err;
2917             }
2918             }
2919 0           cur_state = re_acquire_state_context (aTHX_ &err, dfa, &next_nodes, context);
2920 0 0         if (BE (cur_state == NULL && err != REG_NOERROR, 0))
    0          
2921             {
2922 0 0         re_node_set_free (&next_nodes);
2923 0           return err;
2924             }
2925 0           mctx->state_log[str_idx] = cur_state;
2926             }
2927              
2928 0 0         for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
    0          
2929             {
2930 0           re_node_set_empty (aTHX_ &next_nodes);
2931 0 0         if (mctx->state_log[str_idx + 1])
2932             {
2933 0           err = re_node_set_merge (aTHX_ &next_nodes,
2934 0           &mctx->state_log[str_idx + 1]->nodes);
2935 0 0         if (BE (err != REG_NOERROR, 0))
2936             {
2937 0 0         re_node_set_free (&next_nodes);
2938 0           return err;
2939             }
2940             }
2941 0 0         if (cur_state)
2942             {
2943 0           err = check_arrival_add_next_nodes (aTHX_ mctx, str_idx,
2944             &cur_state->non_eps_nodes,
2945             &next_nodes);
2946 0 0         if (BE (err != REG_NOERROR, 0))
2947             {
2948 0 0         re_node_set_free (&next_nodes);
2949 0           return err;
2950             }
2951             }
2952 0           ++str_idx;
2953 0 0         if (next_nodes.nelem)
2954             {
2955 0           err = check_arrival_expand_ecl (aTHX_ dfa, &next_nodes, subexp_num, type);
2956 0 0         if (BE (err != REG_NOERROR, 0))
2957             {
2958 0 0         re_node_set_free (&next_nodes);
2959 0           return err;
2960             }
2961 0           err = expand_bkref_cache (aTHX_ mctx, &next_nodes, str_idx,
2962             subexp_num, type);
2963 0 0         if (BE (err != REG_NOERROR, 0))
2964             {
2965 0 0         re_node_set_free (&next_nodes);
2966 0           return err;
2967             }
2968             }
2969 0           context = re_string_context_at (aTHX_ &mctx->input, str_idx - 1, mctx->eflags);
2970 0           cur_state = re_acquire_state_context (aTHX_ &err, dfa, &next_nodes, context);
2971 0 0         if (BE (cur_state == NULL && err != REG_NOERROR, 0))
    0          
2972             {
2973 0 0         re_node_set_free (&next_nodes);
2974 0           return err;
2975             }
2976 0           mctx->state_log[str_idx] = cur_state;
2977 0 0         null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2978             }
2979 0 0         re_node_set_free (&next_nodes);
2980 0           cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2981 0 0         : &mctx->state_log[last_str]->nodes);
2982 0           path->next_idx = str_idx;
2983              
2984             /* Fix MCTX. */
2985 0           mctx->state_log = backup_state_log;
2986 0           mctx->input.cur_idx = backup_cur_idx;
2987              
2988             /* Then check the current node set has the node LAST_NODE. */
2989 0 0         if (cur_nodes != NULL && re_node_set_contains (aTHX_ cur_nodes, last_node))
    0          
2990 0           return REG_NOERROR;
2991              
2992 0           return REG_NOMATCH;
2993             }
2994              
2995             /* Helper functions for check_arrival. */
2996              
2997             /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2998             to NEXT_NODES.
2999             TODO: This function is similar to the functions transit_state*(),
3000             however this function has many additional works.
3001             Can't we unify them? */
3002              
3003             static reg_errcode_t
3004             internal_function __attribute_warn_unused_result__
3005 0           check_arrival_add_next_nodes (pTHX_ re_match_context_t *mctx, Idx str_idx,
3006             re_node_set *cur_nodes, re_node_set *next_nodes)
3007             {
3008 0           const re_dfa_t *const dfa = mctx->dfa;
3009             bool ok;
3010             Idx cur_idx;
3011             #ifdef RE_ENABLE_I18N
3012 0           reg_errcode_t err = REG_NOERROR;
3013             #endif
3014             re_node_set union_set;
3015 0           re_node_set_init_empty (&union_set);
3016 0 0         for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3017             {
3018 0           int naccepted = 0;
3019 0           Idx cur_node = cur_nodes->elems[cur_idx];
3020             #ifdef DEBUG
3021             re_token_type_t type = dfa->nodes[cur_node].type;
3022             assert (!IS_EPSILON_NODE (type));
3023             #endif
3024             #ifdef RE_ENABLE_I18N
3025             /* If the node may accept "multi byte". */
3026 0 0         if (dfa->nodes[cur_node].accept_mb)
3027             {
3028 0           naccepted = check_node_accept_bytes (aTHX_ dfa, cur_node, &mctx->input, mctx->sv,
3029             str_idx);
3030 0 0         if (naccepted > 1)
3031             {
3032             re_dfastate_t *dest_state;
3033 0           Idx next_node = dfa->nexts[cur_node];
3034 0           Idx next_idx = str_idx + naccepted;
3035 0           dest_state = mctx->state_log[next_idx];
3036 0           re_node_set_empty (aTHX_ &union_set);
3037 0 0         if (dest_state)
3038             {
3039 0           err = re_node_set_merge (aTHX_ &union_set, &dest_state->nodes);
3040 0 0         if (BE (err != REG_NOERROR, 0))
3041             {
3042 0 0         re_node_set_free (&union_set);
3043 0           return err;
3044             }
3045             }
3046 0           ok = re_node_set_insert (aTHX_ &union_set, next_node);
3047 0 0         if (BE (! ok, 0))
3048             {
3049 0 0         re_node_set_free (&union_set);
3050 0           return REG_ESPACE;
3051             }
3052 0           mctx->state_log[next_idx] = re_acquire_state (aTHX_ &err, dfa,
3053             &union_set);
3054 0 0         if (BE (mctx->state_log[next_idx] == NULL
    0          
3055             && err != REG_NOERROR, 0))
3056             {
3057 0 0         re_node_set_free (&union_set);
3058 0           return err;
3059             }
3060             }
3061             }
3062             #endif /* RE_ENABLE_I18N */
3063 0 0         if (naccepted
3064 0 0         || check_node_accept (aTHX_ mctx, dfa->nodes + cur_node, str_idx))
3065             {
3066 0           ok = re_node_set_insert (aTHX_ next_nodes, dfa->nexts[cur_node]);
3067 0 0         if (BE (! ok, 0))
3068             {
3069 0 0         re_node_set_free (&union_set);
3070 0           return REG_ESPACE;
3071             }
3072             }
3073             }
3074 0 0         re_node_set_free (&union_set);
3075 0           return REG_NOERROR;
3076             }
3077              
3078             /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3079             CUR_NODES, however exclude the nodes which are:
3080             - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3081             - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3082             */
3083              
3084             static reg_errcode_t
3085             internal_function
3086 0           check_arrival_expand_ecl (pTHX_ const re_dfa_t *dfa, re_node_set *cur_nodes,
3087             Idx ex_subexp, int type)
3088             {
3089             reg_errcode_t err;
3090             Idx idx, outside_node;
3091             re_node_set new_nodes;
3092             #ifdef DEBUG
3093             assert (cur_nodes->nelem);
3094             #endif
3095 0           err = re_node_set_alloc (aTHX_ &new_nodes, cur_nodes->nelem);
3096 0 0         if (BE (err != REG_NOERROR, 0))
3097 0           return err;
3098             /* Create a new node set NEW_NODES with the nodes which are epsilon
3099             closures of the node in CUR_NODES. */
3100              
3101 0 0         for (idx = 0; idx < cur_nodes->nelem; ++idx)
3102             {
3103 0           Idx cur_node = cur_nodes->elems[idx];
3104 0           const re_node_set *eclosure = dfa->eclosures + cur_node;
3105 0           outside_node = find_subexp_node (aTHX_ dfa, eclosure, ex_subexp, type);
3106 0 0         if (outside_node == REG_MISSING)
3107             {
3108             /* There are no problematic nodes, just merge them. */
3109 0           err = re_node_set_merge (aTHX_ &new_nodes, eclosure);
3110 0 0         if (BE (err != REG_NOERROR, 0))
3111             {
3112 0 0         re_node_set_free (&new_nodes);
3113 0           return err;
3114             }
3115             }
3116             else
3117             {
3118             /* There are problematic nodes, re-calculate incrementally. */
3119 0           err = check_arrival_expand_ecl_sub (aTHX_ dfa, &new_nodes, cur_node,
3120             ex_subexp, type);
3121 0 0         if (BE (err != REG_NOERROR, 0))
3122             {
3123 0 0         re_node_set_free (&new_nodes);
3124 0           return err;
3125             }
3126             }
3127             }
3128 0 0         re_node_set_free (cur_nodes);
3129 0           *cur_nodes = new_nodes;
3130 0           return REG_NOERROR;
3131             }
3132              
3133             /* Helper function for check_arrival_expand_ecl.
3134             Check incrementally the epsilon closure of TARGET, and if it isn't
3135             problematic append it to DST_NODES. */
3136              
3137             static reg_errcode_t
3138             internal_function __attribute_warn_unused_result__
3139 0           check_arrival_expand_ecl_sub (pTHX_ const re_dfa_t *dfa, re_node_set *dst_nodes,
3140             Idx target, Idx ex_subexp, int type)
3141             {
3142             Idx cur_node;
3143 0 0         for (cur_node = target; !re_node_set_contains (aTHX_ dst_nodes, cur_node);)
3144             {
3145             bool ok;
3146              
3147 0 0         if (dfa->nodes[cur_node].type == type
3148 0 0         && dfa->nodes[cur_node].opr.idx == ex_subexp)
3149             {
3150 0 0         if (type == OP_CLOSE_SUBEXP)
3151             {
3152 0           ok = re_node_set_insert (aTHX_ dst_nodes, cur_node);
3153 0 0         if (BE (! ok, 0))
3154 0           return REG_ESPACE;
3155             }
3156 0           break;
3157             }
3158 0           ok = re_node_set_insert (aTHX_ dst_nodes, cur_node);
3159 0 0         if (BE (! ok, 0))
3160 0           return REG_ESPACE;
3161 0 0         if (dfa->edests[cur_node].nelem == 0)
3162 0           break;
3163 0 0         if (dfa->edests[cur_node].nelem == 2)
3164             {
3165             reg_errcode_t err;
3166 0           err = check_arrival_expand_ecl_sub (aTHX_ dfa, dst_nodes,
3167 0           dfa->edests[cur_node].elems[1],
3168             ex_subexp, type);
3169 0 0         if (BE (err != REG_NOERROR, 0))
3170 0           return err;
3171             }
3172 0           cur_node = dfa->edests[cur_node].elems[0];
3173             }
3174 0           return REG_NOERROR;
3175             }
3176              
3177              
3178             /* For all the back references in the current state, calculate the
3179             destination of the back references by the appropriate entry
3180             in MCTX->BKREF_ENTS. */
3181              
3182             static reg_errcode_t
3183             internal_function __attribute_warn_unused_result__
3184 0           expand_bkref_cache (pTHX_ re_match_context_t *mctx, re_node_set *cur_nodes,
3185             Idx cur_str, Idx subexp_num, int type)
3186             {
3187 0           const re_dfa_t *const dfa = mctx->dfa;
3188             reg_errcode_t err;
3189 0           Idx cache_idx_start = search_cur_bkref_entry (aTHX_ mctx, cur_str);
3190             struct re_backref_cache_entry *ent;
3191              
3192 0 0         if (cache_idx_start == REG_MISSING)
3193 0           return REG_NOERROR;
3194              
3195             restart:
3196 0           ent = mctx->bkref_ents + cache_idx_start;
3197             do
3198             {
3199             Idx to_idx, next_node;
3200              
3201             /* Is this entry ENT is appropriate? */
3202 0 0         if (!re_node_set_contains (aTHX_ cur_nodes, ent->node))
3203 0           continue; /* No. */
3204              
3205 0           to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3206             /* Calculate the destination of the back reference, and append it
3207             to MCTX->STATE_LOG. */
3208 0 0         if (to_idx == cur_str)
3209             {
3210             /* The backreference did epsilon transit, we must re-check all the
3211             node in the current state. */
3212             re_node_set new_dests;
3213             reg_errcode_t err2, err3;
3214 0           next_node = dfa->edests[ent->node].elems[0];
3215 0 0         if (re_node_set_contains (aTHX_ cur_nodes, next_node))
3216 0           continue;
3217 0           err = re_node_set_init_1 (aTHX_ &new_dests, next_node);
3218 0           err2 = check_arrival_expand_ecl (aTHX_ dfa, &new_dests, subexp_num, type);
3219 0           err3 = re_node_set_merge (aTHX_ cur_nodes, &new_dests);
3220 0 0         re_node_set_free (&new_dests);
3221 0 0         if (BE (err != REG_NOERROR || err2 != REG_NOERROR
    0          
    0          
    0          
3222             || err3 != REG_NOERROR, 0))
3223             {
3224 0 0         err = (err != REG_NOERROR ? err
    0          
3225             : (err2 != REG_NOERROR ? err2 : err3));
3226 0           return err;
3227             }
3228             /* TODO: It is still inefficient... */
3229 0           goto restart;
3230             }
3231             else
3232             {
3233             re_node_set union_set;
3234 0           next_node = dfa->nexts[ent->node];
3235 0 0         if (mctx->state_log[to_idx])
3236             {
3237             bool ok;
3238 0 0         if (re_node_set_contains (aTHX_ &mctx->state_log[to_idx]->nodes,
3239             next_node))
3240 0           continue;
3241 0           err = re_node_set_init_copy (aTHX_ &union_set,
3242 0           &mctx->state_log[to_idx]->nodes);
3243 0           ok = re_node_set_insert (aTHX_ &union_set, next_node);
3244 0 0         if (BE (err != REG_NOERROR || ! ok, 0))
    0          
3245             {
3246 0 0         re_node_set_free (&union_set);
3247 0 0         err = err != REG_NOERROR ? err : REG_ESPACE;
3248 0           return err;
3249             }
3250             }
3251             else
3252             {
3253 0           err = re_node_set_init_1 (aTHX_ &union_set, next_node);
3254 0 0         if (BE (err != REG_NOERROR, 0))
3255 0           return err;
3256             }
3257 0           mctx->state_log[to_idx] = re_acquire_state (aTHX_ &err, dfa, &union_set);
3258 0 0         re_node_set_free (&union_set);
3259 0 0         if (BE (mctx->state_log[to_idx] == NULL
    0          
3260             && err != REG_NOERROR, 0))
3261 0           return err;
3262             }
3263             }
3264 0 0         while (ent++->more);
3265 0           return REG_NOERROR;
3266             }
3267              
3268             /* Build transition table for the state.
3269             Return true if successful. */
3270              
3271             static bool
3272             internal_function
3273 19           build_trtable (pTHX_ const re_dfa_t *dfa, re_dfastate_t *state)
3274             {
3275             reg_errcode_t err;
3276             Idx i, j;
3277             int ch;
3278 19           bool need_word_trtable = false;
3279             bitset_word_t elem, mask;
3280 19           bool dests_node_malloced = false;
3281 19           bool dest_states_malloced = false;
3282             Idx ndests; /* Number of the destination states from 'state'. */
3283             re_dfastate_t **trtable;
3284 19           re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3285             re_node_set follows, *dests_node;
3286             bitset_t *dests_ch;
3287             bitset_t acceptable;
3288              
3289             struct dests_alloc
3290             {
3291             re_node_set dests_node[SBC_MAX];
3292             bitset_t dests_ch[SBC_MAX];
3293             } *dests_alloc;
3294              
3295             /* We build DFA states which corresponds to the destination nodes
3296             from 'state'. 'dests_node[i]' represents the nodes which i-th
3297             destination state contains, and 'dests_ch[i]' represents the
3298             characters which i-th destination state accepts. */
3299             if (__libc_use_alloca (sizeof (struct dests_alloc)))
3300             dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3301             else
3302             {
3303 19           re_malloc (dests_alloc, struct dests_alloc, 1);
3304 19           dests_node_malloced = true;
3305             }
3306 19           dests_node = dests_alloc->dests_node;
3307 19           dests_ch = dests_alloc->dests_ch;
3308              
3309             /* Initialize transition table. */
3310 19           state->word_trtable = state->trtable = NULL;
3311              
3312             /* At first, group all nodes belonging to 'state' into several
3313             destinations. */
3314 19           ndests = group_nodes_into_DFAstates (aTHX_ dfa, state, dests_node, dests_ch);
3315 19 100         if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3316             {
3317 2 50         if (dests_node_malloced)
3318 2 50         re_free (dests_alloc);
3319             /* Return false in case of an error, true otherwise. */
3320 2 50         if (ndests == 0)
3321             {
3322 2           re_calloc(state->trtable, re_dfastate_t *, SBC_MAX);
3323 2           return true;
3324             }
3325 0           return false;
3326             }
3327              
3328 17           err = re_node_set_alloc (aTHX_ &follows, ndests + 1);
3329 17 50         if (BE (err != REG_NOERROR, 0))
3330 0           goto out_free;
3331              
3332             /* Avoid arithmetic overflow in size calculation. */
3333 17 50         if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3334             / (3 * sizeof (re_dfastate_t *)))
3335             < ndests),
3336             0))
3337 0           goto out_free;
3338              
3339             if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3340             + ndests * 3 * sizeof (re_dfastate_t *)))
3341             dest_states = (re_dfastate_t **)
3342             alloca (ndests * 3 * sizeof (re_dfastate_t *));
3343             else
3344             {
3345             /*
3346             dest_states = (re_dfastate_t **)
3347             malloc (ndests * 3 * sizeof (re_dfastate_t *));
3348             */
3349 17 50         Newx(dest_states, ndests * 3 * sizeof (re_dfastate_t *), re_dfastate_t *);
3350 17 50         if (BE (dest_states == NULL, 0))
3351             {
3352             out_free:
3353 0 0         if (dest_states_malloced)
3354 0 0         re_free (dest_states);
3355 0 0         re_node_set_free (&follows);
3356 0 0         for (i = 0; i < ndests; ++i)
3357 0 0         re_node_set_free (dests_node + i);
3358 0 0         if (dests_node_malloced)
3359 0 0         re_free (dests_alloc);
3360 0           return false;
3361             }
3362 17           dest_states_malloced = true;
3363             }
3364 17           dest_states_word = dest_states + ndests;
3365 17           dest_states_nl = dest_states_word + ndests;
3366 17           bitset_empty (aTHX_ acceptable);
3367              
3368             /* Then build the states for all destinations. */
3369 34 100         for (i = 0; i < ndests; ++i)
3370             {
3371             Idx next_node;
3372 17           re_node_set_empty (aTHX_ &follows);
3373             /* Merge the follows of this destination states. */
3374 34 100         for (j = 0; j < dests_node[i].nelem; ++j)
3375             {
3376 17           next_node = dfa->nexts[dests_node[i].elems[j]];
3377 17 50         if (next_node != REG_MISSING)
3378             {
3379 17           err = re_node_set_merge (aTHX_ &follows, dfa->eclosures + next_node);
3380 17 50         if (BE (err != REG_NOERROR, 0))
3381 0           goto out_free;
3382             }
3383             }
3384 17           dest_states[i] = re_acquire_state_context (aTHX_ &err, dfa, &follows, 0);
3385 17 50         if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
    0          
3386 0           goto out_free;
3387             /* If the new state has context constraint,
3388             build appropriate states for these contexts. */
3389 17 50         if (dest_states[i]->has_constraint)
3390             {
3391 0           dest_states_word[i] = re_acquire_state_context (aTHX_ &err, dfa, &follows,
3392             CONTEXT_WORD);
3393 0 0         if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
    0          
3394 0           goto out_free;
3395              
3396 0 0         if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
    0          
3397 0           need_word_trtable = true;
3398              
3399 0           dest_states_nl[i] = re_acquire_state_context (aTHX_ &err, dfa, &follows,
3400             CONTEXT_NEWLINE);
3401 0 0         if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
    0          
3402 0           goto out_free;
3403             }
3404             else
3405             {
3406 17           dest_states_word[i] = dest_states[i];
3407 17           dest_states_nl[i] = dest_states[i];
3408             }
3409 17           bitset_merge (aTHX_ acceptable, dests_ch[i]);
3410             }
3411              
3412 17 50         if (!BE (need_word_trtable, 0))
3413             {
3414             /* We don't care about whether the following character is a word
3415             character, or we are in a single-byte character set so we can
3416             discern by looking at the character code: allocate a
3417             256-entry transition table. */
3418 17           re_calloc(state->trtable, re_dfastate_t *, SBC_MAX);
3419 17           trtable = state->trtable;
3420              
3421             /* For all characters ch...: */
3422 85 100         for (i = 0; i < BITSET_WORDS; ++i)
3423 994 100         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3424             elem;
3425 926           mask <<= 1, elem >>= 1, ++ch)
3426 926 100         if (BE (elem & 1, 0))
3427             {
3428             /* There must be exactly one destination which accepts
3429             character ch. See group_nodes_into_DFAstates. */
3430 269 50         for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3431             ;
3432              
3433             /* j-th destination accepts the word character ch. */
3434 269 50         if (dfa->word_char[i] & mask)
3435 0           trtable[ch] = dest_states_word[j];
3436             else
3437 269           trtable[ch] = dest_states[j];
3438             }
3439             }
3440             else
3441             {
3442             /* We care about whether the following character is a word
3443             character, and we are in a multi-byte character set: discern
3444             by looking at the character code: build two 256-entry
3445             transition tables, one starting at trtable[0] and one
3446             starting at trtable[SBC_MAX]. */
3447 0           re_calloc(state->word_trtable, re_dfastate_t *, 2 * SBC_MAX);
3448 0           trtable = state->word_trtable;
3449              
3450             /* For all characters ch...: */
3451 0 0         for (i = 0; i < BITSET_WORDS; ++i)
3452 0 0         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3453             elem;
3454 0           mask <<= 1, elem >>= 1, ++ch)
3455 0 0         if (BE (elem & 1, 0))
3456             {
3457             /* There must be exactly one destination which accepts
3458             character ch. See group_nodes_into_DFAstates. */
3459 0 0         for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3460             ;
3461              
3462             /* j-th destination accepts the word character ch. */
3463 0           trtable[ch] = dest_states[j];
3464 0           trtable[ch + SBC_MAX] = dest_states_word[j];
3465             }
3466             }
3467              
3468             /* new line */
3469 17 100         if (bitset_contain (aTHX_ acceptable, NEWLINE_CHAR))
3470             {
3471             /* The current state accepts newline character. */
3472 2 50         for (j = 0; j < ndests; ++j)
3473 2 50         if (bitset_contain (aTHX_ dests_ch[j], NEWLINE_CHAR))
3474             {
3475             /* k-th destination accepts newline character. */
3476 2           trtable[NEWLINE_CHAR] = dest_states_nl[j];
3477 2 50         if (need_word_trtable)
3478 0           trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3479             /* There must be only one destination which accepts
3480             newline. See group_nodes_into_DFAstates. */
3481 2           break;
3482             }
3483             }
3484              
3485 17 50         if (dest_states_malloced)
3486 17 50         re_free (dest_states);
3487              
3488 17 50         re_node_set_free (&follows);
3489 34 100         for (i = 0; i < ndests; ++i)
3490 17 50         re_node_set_free (dests_node + i);
3491              
3492 17 50         if (dests_node_malloced)
3493 17 50         re_free (dests_alloc);
3494              
3495 19           return true;
3496             }
3497              
3498             /* Group all nodes belonging to STATE into several destinations.
3499             Then for all destinations, set the nodes belonging to the destination
3500             to DESTS_NODE[i] and set the characters accepted by the destination
3501             to DEST_CH[i]. This function return the number of destinations. */
3502              
3503             static Idx
3504             internal_function
3505 19           group_nodes_into_DFAstates (pTHX_ const re_dfa_t *dfa, const re_dfastate_t *state,
3506             re_node_set *dests_node, bitset_t *dests_ch)
3507             {
3508             reg_errcode_t err;
3509             bool ok;
3510             Idx i, j, k;
3511             Idx ndests; /* Number of the destinations from 'state'. */
3512             bitset_t accepts; /* Characters a node can accept. */
3513 19           const re_node_set *cur_nodes = &state->nodes;
3514 19           bitset_empty (aTHX_ accepts);
3515 19           ndests = 0;
3516              
3517             /* For all the nodes belonging to 'state', */
3518 54 100         for (i = 0; i < cur_nodes->nelem; ++i)
3519             {
3520 35           re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3521 35           re_token_type_t type = node->type;
3522 35           unsigned int constraint = node->constraint;
3523              
3524             /* Enumerate all single byte character this node can accept. */
3525 35 100         if (type == CHARACTER)
3526 15           bitset_set (aTHX_ accepts, node->opr.c);
3527 20 100         else if (type == SIMPLE_BRACKET)
3528             {
3529 2           bitset_merge (aTHX_ accepts, node->opr.sbcset);
3530             }
3531 18 50         else if (type == OP_PERIOD)
3532             {
3533             #ifdef RE_ENABLE_I18N
3534 0 0         if (dfa->mb_cur_max > 1)
3535 0           bitset_merge (aTHX_ accepts, dfa->sb_char);
3536             else
3537             #endif
3538 0           bitset_set_all (aTHX_ accepts);
3539 0 0         if (!(dfa->syntax & RE_DOT_NEWLINE))
3540 0           bitset_clear (accepts, '\n');
3541 0 0         if (dfa->syntax & RE_DOT_NOT_NULL)
3542 0           bitset_clear (accepts, '\0');
3543             }
3544             #ifdef RE_ENABLE_I18N
3545 18 50         else if (type == OP_UTF8_PERIOD)
3546             {
3547             if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3548 0           memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3549             else
3550             bitset_merge (aTHX_ accepts, utf8_sb_map);
3551 0 0         if (!(dfa->syntax & RE_DOT_NEWLINE))
3552 0           bitset_clear (accepts, '\n');
3553 0 0         if (dfa->syntax & RE_DOT_NOT_NULL)
3554 0           bitset_clear (accepts, '\0');
3555             }
3556             #endif
3557             else
3558 18           continue;
3559              
3560             /* Check the 'accepts' and sift the characters which are not
3561             match it the context. */
3562 17 50         if (constraint)
3563             {
3564 0 0         if (constraint & NEXT_NEWLINE_CONSTRAINT)
3565             {
3566 0           bool accepts_newline = bitset_contain (aTHX_ accepts, NEWLINE_CHAR);
3567 0           bitset_empty (aTHX_ accepts);
3568 0 0         if (accepts_newline)
3569 0           bitset_set (aTHX_ accepts, NEWLINE_CHAR);
3570             else
3571 0           continue;
3572             }
3573 0 0         if (constraint & NEXT_ENDBUF_CONSTRAINT)
3574             {
3575 0           bitset_empty (aTHX_ accepts);
3576 0           continue;
3577             }
3578              
3579 0 0         if (constraint & NEXT_WORD_CONSTRAINT)
3580             {
3581 0           bitset_word_t any_set = 0;
3582 0 0         if (type == CHARACTER && !node->word_char)
    0          
3583             {
3584 0           bitset_empty (aTHX_ accepts);
3585 0           continue;
3586             }
3587             #ifdef RE_ENABLE_I18N
3588 0 0         if (dfa->mb_cur_max > 1)
3589 0 0         for (j = 0; j < BITSET_WORDS; ++j)
3590 0           any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3591             else
3592             #endif
3593 0 0         for (j = 0; j < BITSET_WORDS; ++j)
3594 0           any_set |= (accepts[j] &= dfa->word_char[j]);
3595 0 0         if (!any_set)
3596 0           continue;
3597             }
3598 0 0         if (constraint & NEXT_NOTWORD_CONSTRAINT)
3599             {
3600 0           bitset_word_t any_set = 0;
3601 0 0         if (type == CHARACTER && node->word_char)
    0          
3602             {
3603 0           bitset_empty (aTHX_ accepts);
3604 0           continue;
3605             }
3606             #ifdef RE_ENABLE_I18N
3607 0 0         if (dfa->mb_cur_max > 1)
3608 0 0         for (j = 0; j < BITSET_WORDS; ++j)
3609 0           any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3610             else
3611             #endif
3612 0 0         for (j = 0; j < BITSET_WORDS; ++j)
3613 0           any_set |= (accepts[j] &= ~dfa->word_char[j]);
3614 0 0         if (!any_set)
3615 0           continue;
3616             }
3617             }
3618              
3619             /* Then divide 'accepts' into DFA states, or create a new
3620             state. Above, we make sure that accepts is not empty. */
3621 17 50         for (j = 0; j < ndests; ++j)
3622             {
3623             bitset_t intersec; /* Intersection sets, see below. */
3624             bitset_t remains;
3625             /* Flags, see below. */
3626             bitset_word_t has_intersec, not_subset, not_consumed;
3627              
3628             /* Optimization, skip if this state doesn't accept the character. */
3629 0 0         if (type == CHARACTER && !bitset_contain (aTHX_ dests_ch[j], node->opr.c))
    0          
3630 0           continue;
3631              
3632             /* Enumerate the intersection set of this state and 'accepts'. */
3633 0           has_intersec = 0;
3634 0 0         for (k = 0; k < BITSET_WORDS; ++k)
3635 0           has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3636             /* And skip if the intersection set is empty. */
3637 0 0         if (!has_intersec)
3638 0           continue;
3639              
3640             /* Then check if this state is a subset of 'accepts'. */
3641 0           not_subset = not_consumed = 0;
3642 0 0         for (k = 0; k < BITSET_WORDS; ++k)
3643             {
3644 0           not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3645 0           not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3646             }
3647              
3648             /* If this state isn't a subset of 'accepts', create a
3649             new group state, which has the 'remains'. */
3650 0 0         if (not_subset)
3651             {
3652 0           bitset_copy (aTHX_ dests_ch[ndests], remains);
3653 0           bitset_copy (aTHX_ dests_ch[j], intersec);
3654 0           err = re_node_set_init_copy (aTHX_ dests_node + ndests, &dests_node[j]);
3655 0 0         if (BE (err != REG_NOERROR, 0))
3656 0           goto error_return;
3657 0           ++ndests;
3658             }
3659              
3660             /* Put the position in the current group. */
3661 0           ok = re_node_set_insert (aTHX_ &dests_node[j], cur_nodes->elems[i]);
3662 0 0         if (BE (! ok, 0))
3663 0           goto error_return;
3664              
3665             /* If all characters are consumed, go to next node. */
3666 0 0         if (!not_consumed)
3667 0           break;
3668             }
3669             /* Some characters remain, create a new group. */
3670 17 50         if (j == ndests)
3671             {
3672 17           bitset_copy (aTHX_ dests_ch[ndests], accepts);
3673 17           err = re_node_set_init_1 (aTHX_ dests_node + ndests, cur_nodes->elems[i]);
3674 17 50         if (BE (err != REG_NOERROR, 0))
3675 0           goto error_return;
3676 17           ++ndests;
3677 17           bitset_empty (aTHX_ accepts);
3678             }
3679             }
3680 19           return ndests;
3681             error_return:
3682 0 0         for (j = 0; j < ndests; ++j)
3683 0 0         re_node_set_free (dests_node + j);
3684 19           return REG_MISSING;
3685             }
3686              
3687             #ifdef RE_ENABLE_I18N
3688             /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3689             Return the number of the bytes the node accepts.
3690             STR_IDX is the current index of the input string.
3691              
3692             This function handles the nodes which can accept one character, or
3693             one collating element like '.', '[a-z]', opposite to the other nodes
3694             can only accept one byte. */
3695              
3696             static int
3697             internal_function
3698 35           check_node_accept_bytes (pTHX_ const re_dfa_t *dfa, Idx node_idx,
3699             const re_string_t *input, SV *sv, Idx str_idx)
3700             {
3701 35           const re_token_t *node = dfa->nodes + node_idx;
3702             int char_len, elem_len;
3703             Idx i;
3704              
3705 35 50         if (BE (node->type == OP_UTF8_PERIOD, 0))
3706             {
3707 0           unsigned char c = re_string_byte_at (input, str_idx), d;
3708 0 0         if (BE (c < 0xc2, 1))
3709 0           return 0;
3710              
3711 0 0         if (str_idx + 2 > input->len)
3712 0           return 0;
3713              
3714 0           d = re_string_byte_at (input, str_idx + 1);
3715 0 0         if (c < 0xe0)
3716 0 0         return (d < 0x80 || d > 0xbf) ? 0 : 2;
    0          
3717 0 0         else if (c < 0xf0)
3718             {
3719 0           char_len = 3;
3720 0 0         if (c == 0xe0 && d < 0xa0)
    0          
3721 0           return 0;
3722             }
3723 0 0         else if (c < 0xf8)
3724             {
3725 0           char_len = 4;
3726 0 0         if (c == 0xf0 && d < 0x90)
    0          
3727 0           return 0;
3728             }
3729 0 0         else if (c < 0xfc)
3730             {
3731 0           char_len = 5;
3732 0 0         if (c == 0xf8 && d < 0x88)
    0          
3733 0           return 0;
3734             }
3735 0 0         else if (c < 0xfe)
3736             {
3737 0           char_len = 6;
3738 0 0         if (c == 0xfc && d < 0x84)
    0          
3739 0           return 0;
3740             }
3741             else
3742 0           return 0;
3743              
3744 0 0         if (str_idx + char_len > input->len)
3745 0           return 0;
3746              
3747 0 0         for (i = 1; i < char_len; ++i)
3748             {
3749 0           d = re_string_byte_at (input, str_idx + i);
3750 0 0         if (d < 0x80 || d > 0xbf)
    0          
3751 0           return 0;
3752             }
3753 0           return char_len;
3754             }
3755              
3756 35           char_len = re_string_char_size_at (aTHX_ input, str_idx);
3757 35 50         if (node->type == OP_PERIOD)
3758             {
3759 0 0         if (char_len <= 1)
3760 0           return 0;
3761             /* FIXME: I don't think this if is needed, as both '\n'
3762             and '\0' are char_len == 1. */
3763             /* '.' accepts any one character except the following two cases. */
3764 0 0         if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
    0          
3765 0 0         re_string_byte_at (input, str_idx) == '\n') ||
3766 0 0         ((dfa->syntax & RE_DOT_NOT_NULL) &&
3767 0           re_string_byte_at (input, str_idx) == '\0'))
3768 0           return 0;
3769 0           return char_len;
3770             }
3771              
3772 35           elem_len = re_string_elem_size_at (aTHX_ input, sv, str_idx);
3773 35 100         if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
    50          
    50          
3774 1           return 0;
3775              
3776 34 50         if (node->type == COMPLEX_BRACKET)
3777             {
3778 34           const re_charset_t *cset = node->opr.mbcset;
3779             # ifdef _LIBC
3780             const unsigned char *pin
3781             = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3782             Idx j;
3783             uint32_t nrules;
3784             # endif /* _LIBC */
3785 34           int match_len = 0;
3786 34 50         wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
    50          
    50          
3787 0           ? re_string_wchar_at (aTHX_ input, str_idx) : 0);
3788              
3789             /* match with multibyte character? */
3790 34 50         for (i = 0; i < cset->nmbchars; ++i)
3791 0 0         if (wc == cset->mbchars[i])
3792             {
3793 0           match_len = char_len;
3794 0           goto check_node_accept_bytes_match;
3795             }
3796             /* match with character_class? */
3797 34 50         for (i = 0; i < cset->nchar_classes; ++i)
3798             {
3799 0           rpl__wctype_t wt = cset->char_classes[i];
3800 0 0         if (rpl__iswctype (wc, wt))
3801             {
3802 0           match_len = char_len;
3803 0           goto check_node_accept_bytes_match;
3804             }
3805             }
3806              
3807             # ifdef _LIBC
3808             nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3809             if (nrules != 0)
3810             {
3811             unsigned int in_collseq = 0;
3812             const int32_t *table, *indirect;
3813             const unsigned char *weights, *extra;
3814             const char *collseqwc;
3815             /* This #include defines a local function! */
3816             # include
3817              
3818             /* match with collating_symbol? */
3819             if (cset->ncoll_syms)
3820             extra = (const unsigned char *)
3821             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3822             for (i = 0; i < cset->ncoll_syms; ++i)
3823             {
3824             const unsigned char *coll_sym = extra + cset->coll_syms[i];
3825             /* Compare the length of input collating element and
3826             the length of current collating element. */
3827             if (*coll_sym != elem_len)
3828             continue;
3829             /* Compare each bytes. */
3830             for (j = 0; j < *coll_sym; j++)
3831             if (pin[j] != coll_sym[1 + j])
3832             break;
3833             if (j == *coll_sym)
3834             {
3835             /* Match if every bytes is equal. */
3836             match_len = j;
3837             goto check_node_accept_bytes_match;
3838             }
3839             }
3840              
3841             if (cset->nranges)
3842             {
3843             if (elem_len <= char_len)
3844             {
3845             collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3846             in_collseq = __collseq_table_lookup (collseqwc, wc);
3847             }
3848             else
3849             in_collseq = find_collation_sequence_value (pin, elem_len);
3850             }
3851             /* match with range expression? */
3852             /* FIXME: Implement rational ranges here, too. */
3853             for (i = 0; i < cset->nranges; ++i)
3854             if (cset->range_starts[i] <= in_collseq
3855             && in_collseq <= cset->range_ends[i])
3856             {
3857             match_len = elem_len;
3858             goto check_node_accept_bytes_match;
3859             }
3860              
3861             /* match with equivalence_class? */
3862             if (cset->nequiv_classes)
3863             {
3864             const unsigned char *cp = pin;
3865             table = (const int32_t *)
3866             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3867             weights = (const unsigned char *)
3868             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3869             extra = (const unsigned char *)
3870             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3871             indirect = (const int32_t *)
3872             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3873             int32_t idx = findidx (&cp, elem_len);
3874             if (idx > 0)
3875             for (i = 0; i < cset->nequiv_classes; ++i)
3876             {
3877             int32_t equiv_class_idx = cset->equiv_classes[i];
3878             size_t weight_len = weights[idx & 0xffffff];
3879             if (weight_len == weights[equiv_class_idx & 0xffffff]
3880             && (idx >> 24) == (equiv_class_idx >> 24))
3881             {
3882             Idx cnt = 0;
3883              
3884             idx &= 0xffffff;
3885             equiv_class_idx &= 0xffffff;
3886              
3887             while (cnt <= weight_len
3888             && (weights[equiv_class_idx + 1 + cnt]
3889             == weights[idx + 1 + cnt]))
3890             ++cnt;
3891             if (cnt > weight_len)
3892             {
3893             match_len = elem_len;
3894             goto check_node_accept_bytes_match;
3895             }
3896             }
3897             }
3898             }
3899             }
3900             else
3901             # endif /* _LIBC */
3902             {
3903             /* match with range expression? */
3904 34 50         for (i = 0; i < cset->nranges; ++i)
3905             {
3906 0 0         if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
    0          
3907             {
3908 0           match_len = char_len;
3909 0           goto check_node_accept_bytes_match;
3910             }
3911             }
3912             }
3913             check_node_accept_bytes_match:
3914 34 50         if (!cset->non_match)
3915 0           return match_len;
3916             else
3917             {
3918 34 50         if (match_len > 0)
3919 0           return 0;
3920             else
3921 34           return (elem_len > char_len) ? elem_len : char_len;
3922             }
3923             }
3924 0           return 0;
3925             }
3926              
3927             # ifdef _LIBC
3928             static unsigned int
3929             internal_function
3930             find_collation_sequence_value (pTHX_ const unsigned char *mbs, size_t mbs_len)
3931             {
3932             uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3933             if (nrules == 0)
3934             {
3935             if (mbs_len == 1)
3936             {
3937             /* No valid character. Match it as a single byte character. */
3938             const unsigned char *collseq = (const unsigned char *)
3939             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3940             return collseq[mbs[0]];
3941             }
3942             return UINT_MAX;
3943             }
3944             else
3945             {
3946             int32_t idx;
3947             const unsigned char *extra = (const unsigned char *)
3948             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3949             int32_t extrasize = (const unsigned char *)
3950             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3951              
3952             for (idx = 0; idx < extrasize;)
3953             {
3954             int mbs_cnt;
3955             bool found = false;
3956             int32_t elem_mbs_len;
3957             /* Skip the name of collating element name. */
3958             idx = idx + extra[idx] + 1;
3959             elem_mbs_len = extra[idx++];
3960             if (mbs_len == elem_mbs_len)
3961             {
3962             for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3963             if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3964             break;
3965             if (mbs_cnt == elem_mbs_len)
3966             /* Found the entry. */
3967             found = true;
3968             }
3969             /* Skip the byte sequence of the collating element. */
3970             idx += elem_mbs_len;
3971             /* Adjust for the alignment. */
3972             idx = (idx + 3) & ~3;
3973             /* Skip the collation sequence value. */
3974             idx += sizeof (uint32_t);
3975             /* Skip the wide char sequence of the collating element. */
3976             idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3977             /* If we found the entry, return the sequence value. */
3978             if (found)
3979             return *(uint32_t *) (extra + idx);
3980             /* Skip the collation sequence value. */
3981             idx += sizeof (uint32_t);
3982             }
3983             return UINT_MAX;
3984             }
3985             }
3986             # endif /* _LIBC */
3987             #endif /* RE_ENABLE_I18N */
3988              
3989             /* Check whether the node accepts the byte which is IDX-th
3990             byte of the INPUT. */
3991              
3992             static bool
3993             internal_function
3994 23           check_node_accept (pTHX_ const re_match_context_t *mctx, const re_token_t *node,
3995             Idx idx)
3996             {
3997             unsigned char ch;
3998 23           ch = re_string_byte_at (&mctx->input, idx);
3999 23           switch (node->type)
4000             {
4001             case CHARACTER:
4002 14 50         if (node->opr.c != ch)
4003 0           return false;
4004 14           break;
4005              
4006             case SIMPLE_BRACKET:
4007 9 50         if (!bitset_contain (aTHX_ node->opr.sbcset, ch))
4008 9           return false;
4009 0           break;
4010              
4011             #ifdef RE_ENABLE_I18N
4012             case OP_UTF8_PERIOD:
4013 0 0         if (ch >= ASCII_CHARS)
4014 0           return false;
4015             /* FALLTHROUGH */
4016             #endif
4017             case OP_PERIOD:
4018 0 0         if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
    0          
4019 0 0         || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
    0          
4020 0           return false;
4021 0           break;
4022              
4023             default:
4024 0           return false;
4025             }
4026              
4027 14 50         if (node->constraint)
4028             {
4029             /* The node has constraints. Check whether the current context
4030             satisfies the constraints. */
4031 0           unsigned int context = re_string_context_at (aTHX_ &mctx->input, idx,
4032             mctx->eflags);
4033 0 0         if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
    0          
    0          
    0          
    0          
    0          
    0          
    0          
4034 0           return false;
4035             }
4036              
4037 14           return true;
4038             }
4039              
4040             /* Extend the buffers, if the buffers have run out. */
4041              
4042             static reg_errcode_t
4043             internal_function __attribute_warn_unused_result__
4044 0           extend_buffers (pTHX_ re_match_context_t *mctx, Idx min_len)
4045             {
4046             reg_errcode_t ret;
4047 0           re_string_t *pstr = &mctx->input;
4048              
4049             /* Avoid overflow. */
4050 0 0         if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4051             <= pstr->bufs_len, 0))
4052 0           return REG_ESPACE;
4053              
4054             /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4055 0           ret = re_string_realloc_buffers (aTHX_ pstr,
4056 0           MAX (min_len,
4057             MIN (pstr->len, pstr->bufs_len * 2)));
4058 0 0         if (BE (ret != REG_NOERROR, 0))
4059 0           return ret;
4060              
4061 0 0         if (mctx->state_log != NULL)
4062             {
4063             /* And double the length of state_log. */
4064             /* XXX We have no indication of the size of this buffer. If this
4065             allocation fail we have no indication that the state_log array
4066             does not have the right size. */
4067 0 0         re_realloc (mctx->state_log, re_dfastate_t *, pstr->bufs_len + 1);
4068             }
4069              
4070             /* Then reconstruct the buffers. */
4071 0 0         if (pstr->icase)
4072             {
4073             #ifdef RE_ENABLE_I18N
4074 0 0         if (pstr->mb_cur_max > 1)
4075             {
4076 0           ret = build_wcs_upper_buffer (aTHX_ pstr);
4077 0 0         if (BE (ret != REG_NOERROR, 0))
4078 0           return ret;
4079             }
4080             else
4081             #endif /* RE_ENABLE_I18N */
4082 0           build_upper_buffer (aTHX_ pstr);
4083             }
4084             else
4085             {
4086             #ifdef RE_ENABLE_I18N
4087 0 0         if (pstr->mb_cur_max > 1)
4088 0           build_wcs_buffer (aTHX_ pstr);
4089             else
4090             #endif /* RE_ENABLE_I18N */
4091             {
4092 0 0         if (pstr->trans != NULL)
4093 0           re_string_translate_buffer (aTHX_ pstr);
4094             }
4095             }
4096 0           return REG_NOERROR;
4097             }
4098              
4099            
4100             /* Functions for matching context. */
4101              
4102             /* Initialize MCTX. */
4103              
4104             static reg_errcode_t
4105             internal_function __attribute_warn_unused_result__
4106 14           match_ctx_init (pTHX_ re_match_context_t *mctx, int eflags, Idx n)
4107             {
4108 14           mctx->eflags = eflags;
4109 14           mctx->match_last = REG_MISSING;
4110 14 50         if (n > 0)
4111             {
4112             /* Avoid overflow. */
4113 0           size_t max_object_size =
4114             MAX (sizeof (struct re_backref_cache_entry),
4115             sizeof (re_sub_match_top_t *));
4116 0 0         if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4117 0           return REG_ESPACE;
4118              
4119 0 0         re_malloc (mctx->bkref_ents, struct re_backref_cache_entry, n);
4120 0 0         re_malloc (mctx->sub_tops, re_sub_match_top_t *, n);
4121             }
4122             /* Already zero-ed by the caller.
4123             else
4124             mctx->bkref_ents = NULL;
4125             mctx->nbkref_ents = 0;
4126             mctx->nsub_tops = 0; */
4127 14           mctx->abkref_ents = n;
4128 14           mctx->max_mb_elem_len = 1;
4129 14           mctx->asub_tops = n;
4130 14           return REG_NOERROR;
4131             }
4132              
4133             /* Clean the entries which depend on the current input in MCTX.
4134             This function must be invoked when the matcher changes the start index
4135             of the input, or changes the input string. */
4136              
4137             static void
4138             internal_function
4139 23           match_ctx_clean (pTHX_ re_match_context_t *mctx)
4140             {
4141             Idx st_idx;
4142 23 50         for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4143             {
4144             Idx sl_idx;
4145 0           re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4146 0 0         for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4147             {
4148 0           re_sub_match_last_t *last = top->lasts[sl_idx];
4149 0 0         re_free (last->path.array);
4150 0 0         re_free (last);
4151             }
4152 0 0         re_free (top->lasts);
4153 0 0         if (top->path)
4154             {
4155 0 0         re_free (top->path->array);
4156 0 0         re_free (top->path);
4157             }
4158 0 0         re_free (top);
4159             }
4160              
4161 23           mctx->nsub_tops = 0;
4162 23           mctx->nbkref_ents = 0;
4163 23           }
4164              
4165             /* Free all the memory associated with MCTX. */
4166              
4167             static void
4168             internal_function
4169 0           match_ctx_free (pTHX_ re_match_context_t *mctx)
4170             {
4171             /* First, free all the memory associated with MCTX->SUB_TOPS. */
4172 0           match_ctx_clean (aTHX_ mctx);
4173 0 0         re_free (mctx->sub_tops);
4174 0 0         re_free (mctx->bkref_ents);
4175 0           }
4176              
4177             /* Add a new backreference entry to MCTX.
4178             Note that we assume that caller never call this function with duplicate
4179             entry, and call with STR_IDX which isn't smaller than any existing entry.
4180             */
4181              
4182             static reg_errcode_t
4183             internal_function __attribute_warn_unused_result__
4184 0           match_ctx_add_entry (pTHX_ re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4185             Idx to)
4186             {
4187 0 0         if (mctx->nbkref_ents >= mctx->abkref_ents)
4188             {
4189 0 0         re_realloc (mctx->bkref_ents, struct re_backref_cache_entry, mctx->abkref_ents * 2);
4190 0           memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4191 0           sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4192 0           mctx->abkref_ents *= 2;
4193             }
4194 0 0         if (mctx->nbkref_ents > 0
4195 0 0         && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4196 0           mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4197              
4198 0           mctx->bkref_ents[mctx->nbkref_ents].node = node;
4199 0           mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4200 0           mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4201 0           mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4202              
4203             /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4204             If bit N is clear, means that this entry won't epsilon-transition to
4205             an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4206             it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4207             such node.
4208              
4209             A backreference does not epsilon-transition unless it is empty, so set
4210             to all zeros if FROM != TO. */
4211 0           mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4212 0 0         = (from == to ? -1 : 0);
4213              
4214 0           mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4215 0 0         if (mctx->max_mb_elem_len < to - from)
4216 0           mctx->max_mb_elem_len = to - from;
4217 0           return REG_NOERROR;
4218             }
4219              
4220             /* Return the first entry with the same str_idx, or REG_MISSING if none is
4221             found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4222              
4223             static Idx
4224             internal_function
4225 0           search_cur_bkref_entry (pTHX_ const re_match_context_t *mctx, Idx str_idx)
4226             {
4227             Idx left, right, mid, last;
4228 0           last = right = mctx->nbkref_ents;
4229 0 0         for (left = 0; left < right;)
4230             {
4231 0           mid = (left + right) / 2;
4232 0 0         if (mctx->bkref_ents[mid].str_idx < str_idx)
4233 0           left = mid + 1;
4234             else
4235 0           right = mid;
4236             }
4237 0 0         if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
    0          
4238 0           return left;
4239             else
4240 0           return REG_MISSING;
4241             }
4242              
4243             /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4244             at STR_IDX. */
4245              
4246             static reg_errcode_t
4247             internal_function __attribute_warn_unused_result__
4248 0           match_ctx_add_subtop (pTHX_ re_match_context_t *mctx, Idx node, Idx str_idx)
4249             {
4250             #ifdef DEBUG
4251             assert (mctx->sub_tops != NULL);
4252             assert (mctx->asub_tops > 0);
4253             #endif
4254 0 0         if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4255             {
4256 0           Idx new_asub_tops = mctx->asub_tops * 2;
4257 0 0         re_realloc (mctx->sub_tops, re_sub_match_top_t *, new_asub_tops);
4258 0           mctx->asub_tops = new_asub_tops;
4259             }
4260 0           re_calloc(mctx->sub_tops[mctx->nsub_tops], re_sub_match_top_t, 1);
4261 0           mctx->sub_tops[mctx->nsub_tops]->node = node;
4262 0           mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4263 0           return REG_NOERROR;
4264             }
4265              
4266             /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4267             at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4268              
4269             static re_sub_match_last_t *
4270             internal_function
4271 0           match_ctx_add_sublast (pTHX_ re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4272             {
4273             re_sub_match_last_t *new_entry;
4274 0 0         if (BE (subtop->nlasts == subtop->alasts, 0))
4275             {
4276 0           Idx new_alasts = 2 * subtop->alasts + 1;
4277 0 0         re_realloc (subtop->lasts, re_sub_match_last_t *, new_alasts);
4278 0           subtop->alasts = new_alasts;
4279             }
4280 0           re_calloc(new_entry, re_sub_match_last_t, 1);
4281 0           subtop->lasts[subtop->nlasts] = new_entry;
4282 0           new_entry->node = node;
4283 0           new_entry->str_idx = str_idx;
4284 0           ++subtop->nlasts;
4285 0           return new_entry;
4286             }
4287              
4288             static void
4289             internal_function
4290 9           sift_ctx_init (pTHX_ re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4291             re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4292             {
4293 9           sctx->sifted_states = sifted_sts;
4294 9           sctx->limited_states = limited_sts;
4295 9           sctx->last_node = last_node;
4296 9           sctx->last_str_idx = last_str_idx;
4297 9           re_node_set_init_empty (&sctx->limits);
4298 9           }