1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4 Copyright (C) 2002-2011 Free Software Foundation, Inc.
5 This file is part of the GNU C Library.
6 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License along
19 with this program; if not, write to the Free Software Foundation,
20 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
23 Idx n) internal_function;
24 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
25 static void match_ctx_free (re_match_context_t *cache) internal_function;
26 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
27 Idx str_idx, Idx from, Idx to)
29 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
31 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
32 Idx str_idx) internal_function;
33 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
34 Idx node, Idx str_idx)
36 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
37 re_dfastate_t **limited_sts, Idx last_node,
40 static reg_errcode_t re_search_internal (const regex_t *preg,
41 const char *string, Idx length,
42 Idx start, Idx last_start, Idx stop,
43 size_t nmatch, regmatch_t pmatch[],
44 int eflags) internal_function;
45 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
46 const char *string1, Idx length1,
47 const char *string2, Idx length2,
48 Idx start, regoff_t range,
49 struct re_registers *regs,
50 Idx stop, bool ret_len) internal_function;
51 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
52 const char *string, Idx length, Idx start,
53 regoff_t range, Idx stop,
54 struct re_registers *regs,
55 bool ret_len) internal_function;
56 static unsigned int re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
57 Idx nregs, int regs_allocated)
59 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
61 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
62 Idx *p_match_first) internal_function;
63 static Idx check_halt_state_context (const re_match_context_t *mctx,
64 const re_dfastate_t *state, Idx idx)
66 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
67 regmatch_t *prev_idx_match, Idx cur_node,
68 Idx cur_idx, Idx nmatch) internal_function;
69 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
70 Idx str_idx, Idx dest_node, Idx nregs,
72 re_node_set *eps_via_nodes)
74 static reg_errcode_t set_regs (const regex_t *preg,
75 const re_match_context_t *mctx,
76 size_t nmatch, regmatch_t *pmatch,
77 bool fl_backtrack) internal_function;
78 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
82 static int sift_states_iter_mb (const re_match_context_t *mctx,
83 re_sift_context_t *sctx,
84 Idx node_idx, Idx str_idx, Idx max_str_idx)
86 #endif /* RE_ENABLE_I18N */
87 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
88 re_sift_context_t *sctx)
90 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
91 re_sift_context_t *sctx, Idx str_idx,
92 re_node_set *cur_dest)
94 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
95 re_sift_context_t *sctx,
97 re_node_set *dest_nodes)
99 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
100 re_node_set *dest_nodes,
101 const re_node_set *candidates)
103 static bool check_dst_limits (const re_match_context_t *mctx,
104 const re_node_set *limits,
105 Idx dst_node, Idx dst_idx, Idx src_node,
106 Idx src_idx) internal_function;
107 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
108 int boundaries, Idx subexp_idx,
109 Idx from_node, Idx bkref_idx)
111 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
112 Idx limit, Idx subexp_idx,
113 Idx node, Idx str_idx,
114 Idx bkref_idx) internal_function;
115 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
116 re_node_set *dest_nodes,
117 const re_node_set *candidates,
119 struct re_backref_cache_entry *bkref_ents,
120 Idx str_idx) internal_function;
121 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
122 re_sift_context_t *sctx,
123 Idx str_idx, const re_node_set *candidates)
125 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
127 re_dfastate_t **src, Idx num)
129 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
130 re_match_context_t *mctx) internal_function;
131 static re_dfastate_t *transit_state (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *state) internal_function;
134 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
135 re_match_context_t *mctx,
136 re_dfastate_t *next_state)
138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
139 re_node_set *cur_nodes,
140 Idx str_idx) internal_function;
142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
143 re_match_context_t *mctx,
144 re_dfastate_t *pstate)
147 #ifdef RE_ENABLE_I18N
148 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
149 re_dfastate_t *pstate)
151 #endif /* RE_ENABLE_I18N */
152 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
153 const re_node_set *nodes)
155 static reg_errcode_t get_subexp (re_match_context_t *mctx,
156 Idx bkref_node, Idx bkref_str_idx)
158 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
159 const re_sub_match_top_t *sub_top,
160 re_sub_match_last_t *sub_last,
161 Idx bkref_node, Idx bkref_str)
163 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
164 Idx subexp_idx, int type) internal_function;
165 static reg_errcode_t check_arrival (re_match_context_t *mctx,
166 state_array_t *path, Idx top_node,
167 Idx top_str, Idx last_node, Idx last_str,
168 int type) internal_function;
169 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
171 re_node_set *cur_nodes,
172 re_node_set *next_nodes)
174 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
175 re_node_set *cur_nodes,
176 Idx ex_subexp, int type)
178 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
179 re_node_set *dst_nodes,
180 Idx target, Idx ex_subexp,
181 int type) internal_function;
182 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
183 re_node_set *cur_nodes, Idx cur_str,
184 Idx subexp_num, int type)
186 static bool build_trtable (const re_dfa_t *dfa,
187 re_dfastate_t *state) internal_function;
188 #ifdef RE_ENABLE_I18N
189 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
190 const re_string_t *input, Idx idx)
193 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
197 #endif /* RE_ENABLE_I18N */
198 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
199 const re_dfastate_t *state,
200 re_node_set *states_node,
201 bitset_t *states_ch) internal_function;
202 static bool check_node_accept (const re_match_context_t *mctx,
203 const re_token_t *node, Idx idx)
205 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
208 /* Entry point for POSIX code. */
210 /* regexec searches for a given pattern, specified by PREG, in the
213 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
214 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
215 least NMATCH elements, and we set them to the offsets of the
216 corresponding matched substrings.
218 EFLAGS specifies `execution flags' which affect matching: if
219 REG_NOTBOL is set, then ^ does not match at the beginning of the
220 string; if REG_NOTEOL is set, then $ does not match at the end.
222 We return 0 if we find a match and REG_NOMATCH if not. */
225 regexec (preg, string, nmatch, pmatch, eflags)
226 const regex_t *_Restrict_ preg;
227 const char *_Restrict_ string;
229 regmatch_t pmatch[_Restrict_arr_];
235 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
238 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
241 if (eflags & REG_STARTEND)
243 start = pmatch[0].rm_so;
244 length = pmatch[0].rm_eo;
249 length = strlen (string);
252 __libc_lock_lock (dfa->lock);
254 err = re_search_internal (preg, string, length, start, length,
255 length, 0, NULL, eflags);
257 err = re_search_internal (preg, string, length, start, length,
258 length, nmatch, pmatch, eflags);
259 __libc_lock_unlock (dfa->lock);
260 return err != REG_NOERROR;
264 # include <shlib-compat.h>
265 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
267 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
268 __typeof__ (__regexec) __compat_regexec;
271 attribute_compat_text_section
272 __compat_regexec (const regex_t *_Restrict_ preg,
273 const char *_Restrict_ string, size_t nmatch,
274 regmatch_t pmatch[], int eflags)
276 return regexec (preg, string, nmatch, pmatch,
277 eflags & (REG_NOTBOL | REG_NOTEOL));
279 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
283 /* Entry points for GNU code. */
285 /* re_match, re_search, re_match_2, re_search_2
287 The former two functions operate on STRING with length LENGTH,
288 while the later two operate on concatenation of STRING1 and STRING2
289 with lengths LENGTH1 and LENGTH2, respectively.
291 re_match() matches the compiled pattern in BUFP against the string,
292 starting at index START.
294 re_search() first tries matching at index START, then it tries to match
295 starting from index START + 1, and so on. The last start position tried
296 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
299 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
300 the first STOP characters of the concatenation of the strings should be
303 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
304 and all groups is stored in REGS. (For the "_2" variants, the offsets are
305 computed relative to the concatenation, not relative to the individual
308 On success, re_match* functions return the length of the match, re_search*
309 return the position of the start of the match. Return value -1 means no
310 match was found and -2 indicates an internal error. */
313 re_match (bufp, string, length, start, regs)
314 struct re_pattern_buffer *bufp;
317 struct re_registers *regs;
319 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
322 weak_alias (__re_match, re_match)
326 re_search (bufp, string, length, start, range, regs)
327 struct re_pattern_buffer *bufp;
331 struct re_registers *regs;
333 return re_search_stub (bufp, string, length, start, range, length, regs,
337 weak_alias (__re_search, re_search)
341 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
342 struct re_pattern_buffer *bufp;
343 const char *string1, *string2;
344 Idx length1, length2, start, stop;
345 struct re_registers *regs;
347 return re_search_2_stub (bufp, string1, length1, string2, length2,
348 start, 0, regs, stop, true);
351 weak_alias (__re_match_2, re_match_2)
355 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
356 struct re_pattern_buffer *bufp;
357 const char *string1, *string2;
358 Idx length1, length2, start, stop;
360 struct re_registers *regs;
362 return re_search_2_stub (bufp, string1, length1, string2, length2,
363 start, range, regs, stop, false);
366 weak_alias (__re_search_2, re_search_2)
371 re_search_2_stub (struct re_pattern_buffer *bufp,
372 const char *string1, Idx length1,
373 const char *string2, Idx length2,
374 Idx start, regoff_t range, struct re_registers *regs,
375 Idx stop, bool ret_len)
379 Idx len = length1 + length2;
382 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
385 /* Concatenate the strings. */
389 s = re_malloc (char, len);
391 if (BE (s == NULL, 0))
394 memcpy (__mempcpy (s, string1, length1), string2, length2);
396 memcpy (s, string1, length1);
397 memcpy (s + length1, string2, length2);
406 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
412 /* The parameters have the same meaning as those of re_search.
413 Additional parameters:
414 If RET_LEN is true the length of the match is returned (re_match style);
415 otherwise the position of the match is returned. */
419 re_search_stub (struct re_pattern_buffer *bufp,
420 const char *string, Idx length,
421 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
424 reg_errcode_t result;
430 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
432 Idx last_start = start + range;
434 /* Check for out-of-range. */
435 if (BE (start < 0 || start > length, 0))
437 if (BE (length < last_start || (0 <= range && last_start < start), 0))
439 else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
442 __libc_lock_lock (dfa->lock);
444 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
445 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
447 /* Compile fastmap if we haven't yet. */
448 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
449 re_compile_fastmap (bufp);
451 if (BE (bufp->no_sub, 0))
454 /* We need at least 1 register. */
457 else if (BE (bufp->regs_allocated == REGS_FIXED
458 && regs->num_regs <= bufp->re_nsub, 0))
460 nregs = regs->num_regs;
461 if (BE (nregs < 1, 0))
463 /* Nothing can be copied to regs. */
469 nregs = bufp->re_nsub + 1;
470 pmatch = re_malloc (regmatch_t, nregs);
471 if (BE (pmatch == NULL, 0))
477 result = re_search_internal (bufp, string, length, start, last_start, stop,
478 nregs, pmatch, eflags);
482 /* I hope we needn't fill ther regs with -1's when no match was found. */
483 if (result != REG_NOERROR)
485 else if (regs != NULL)
487 /* If caller wants register contents data back, copy them. */
488 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
489 bufp->regs_allocated);
490 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
494 if (BE (rval == 0, 1))
498 assert (pmatch[0].rm_so == start);
499 rval = pmatch[0].rm_eo - start;
502 rval = pmatch[0].rm_so;
506 __libc_lock_unlock (dfa->lock);
512 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
515 int rval = REGS_REALLOCATE;
517 Idx need_regs = nregs + 1;
518 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
521 /* Have the register data arrays been allocated? */
522 if (regs_allocated == REGS_UNALLOCATED)
523 { /* No. So allocate them with malloc. */
524 regs->start = re_malloc (regoff_t, need_regs);
525 if (BE (regs->start == NULL, 0))
526 return REGS_UNALLOCATED;
527 regs->end = re_malloc (regoff_t, need_regs);
528 if (BE (regs->end == NULL, 0))
530 re_free (regs->start);
531 return REGS_UNALLOCATED;
533 regs->num_regs = need_regs;
535 else if (regs_allocated == REGS_REALLOCATE)
536 { /* Yes. If we need more elements than were already
537 allocated, reallocate them. If we need fewer, just
539 if (BE (need_regs > regs->num_regs, 0))
541 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
543 if (BE (new_start == NULL, 0))
544 return REGS_UNALLOCATED;
545 new_end = re_realloc (regs->end, regoff_t, need_regs);
546 if (BE (new_end == NULL, 0))
549 return REGS_UNALLOCATED;
551 regs->start = new_start;
553 regs->num_regs = need_regs;
558 assert (regs_allocated == REGS_FIXED);
559 /* This function may not be called with REGS_FIXED and nregs too big. */
560 assert (regs->num_regs >= nregs);
565 for (i = 0; i < nregs; ++i)
567 regs->start[i] = pmatch[i].rm_so;
568 regs->end[i] = pmatch[i].rm_eo;
570 for ( ; i < regs->num_regs; ++i)
571 regs->start[i] = regs->end[i] = -1;
576 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
577 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
578 this memory for recording register information. STARTS and ENDS
579 must be allocated using the malloc library routine, and must each
580 be at least NUM_REGS * sizeof (regoff_t) bytes long.
582 If NUM_REGS == 0, then subsequent matches should allocate their own
585 Unless this function is called, the first search or match using
586 PATTERN_BUFFER will allocate its own register data, without
587 freeing the old data. */
590 re_set_registers (bufp, regs, num_regs, starts, ends)
591 struct re_pattern_buffer *bufp;
592 struct re_registers *regs;
593 __re_size_t num_regs;
594 regoff_t *starts, *ends;
598 bufp->regs_allocated = REGS_REALLOCATE;
599 regs->num_regs = num_regs;
600 regs->start = starts;
605 bufp->regs_allocated = REGS_UNALLOCATED;
607 regs->start = regs->end = NULL;
611 weak_alias (__re_set_registers, re_set_registers)
614 /* Entry points compatible with 4.2 BSD regex library. We don't define
615 them unless specifically requested. */
617 #if defined _REGEX_RE_COMP || defined _LIBC
625 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
627 #endif /* _REGEX_RE_COMP */
629 /* Internal entry point. */
631 /* Searches for a compiled pattern PREG in the string STRING, whose
632 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
633 meaning as with regexec. LAST_START is START + RANGE, where
634 START and RANGE have the same meaning as with re_search.
635 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
636 otherwise return the error code.
637 Note: We assume front end functions already check ranges.
638 (0 <= LAST_START && LAST_START <= LENGTH) */
641 internal_function __attribute_warn_unused_result__
642 re_search_internal (const regex_t *preg,
643 const char *string, Idx length,
644 Idx start, Idx last_start, Idx stop,
645 size_t nmatch, regmatch_t pmatch[],
649 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
650 Idx left_lim, right_lim;
652 bool fl_longest_match;
655 Idx match_last = REG_MISSING;
659 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
660 re_match_context_t mctx = { .dfa = dfa };
662 re_match_context_t mctx;
664 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
665 && start != last_start && !preg->can_be_null)
666 ? preg->fastmap : NULL);
667 RE_TRANSLATE_TYPE t = preg->translate;
669 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
670 memset (&mctx, '\0', sizeof (re_match_context_t));
674 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
675 nmatch -= extra_nmatch;
677 /* Check if the DFA haven't been compiled. */
678 if (BE (preg->used == 0 || dfa->init_state == NULL
679 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
680 || dfa->init_state_begbuf == NULL, 0))
684 /* We assume front-end functions already check them. */
685 assert (0 <= last_start && last_start <= length);
688 /* If initial states with non-begbuf contexts have no elements,
689 the regex must be anchored. If preg->newline_anchor is set,
690 we'll never use init_state_nl, so do not check it. */
691 if (dfa->init_state->nodes.nelem == 0
692 && dfa->init_state_word->nodes.nelem == 0
693 && (dfa->init_state_nl->nodes.nelem == 0
694 || !preg->newline_anchor))
696 if (start != 0 && last_start != 0)
698 start = last_start = 0;
701 /* We must check the longest matching, if nmatch > 0. */
702 fl_longest_match = (nmatch != 0 || dfa->nbackref);
704 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
705 preg->translate, (preg->syntax & RE_ICASE) != 0,
707 if (BE (err != REG_NOERROR, 0))
709 mctx.input.stop = stop;
710 mctx.input.raw_stop = stop;
711 mctx.input.newline_anchor = preg->newline_anchor;
713 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
714 if (BE (err != REG_NOERROR, 0))
717 /* We will log all the DFA states through which the dfa pass,
718 if nmatch > 1, or this dfa has "multibyte node", which is a
719 back-reference or a node which can accept multibyte character or
720 multi character collating element. */
721 if (nmatch > 1 || dfa->has_mb_node)
723 /* Avoid overflow. */
724 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
730 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
731 if (BE (mctx.state_log == NULL, 0))
738 mctx.state_log = NULL;
741 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
742 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
744 /* Check incrementally whether of not the input string match. */
745 incr = (last_start < start) ? -1 : 1;
746 left_lim = (last_start < start) ? last_start : start;
747 right_lim = (last_start < start) ? start : last_start;
748 sb = dfa->mb_cur_max == 1;
751 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
752 | (start <= last_start ? 2 : 0)
753 | (t != NULL ? 1 : 0))
756 for (;; match_first += incr)
759 if (match_first < left_lim || right_lim < match_first)
762 /* Advance as rapidly as possible through the string, until we
763 find a plausible place to start matching. This may be done
764 with varying efficiency, so there are various possibilities:
765 only the most common of them are specialized, in order to
766 save on code size. We use a switch statement for speed. */
774 /* Fastmap with single-byte translation, match forward. */
775 while (BE (match_first < right_lim, 1)
776 && !fastmap[t[(unsigned char) string[match_first]]])
778 goto forward_match_found_start_or_reached_end;
781 /* Fastmap without translation, match forward. */
782 while (BE (match_first < right_lim, 1)
783 && !fastmap[(unsigned char) string[match_first]])
786 forward_match_found_start_or_reached_end:
787 if (BE (match_first == right_lim, 0))
789 ch = match_first >= length
790 ? 0 : (unsigned char) string[match_first];
791 if (!fastmap[t ? t[ch] : ch])
798 /* Fastmap without multi-byte translation, match backwards. */
799 while (match_first >= left_lim)
801 ch = match_first >= length
802 ? 0 : (unsigned char) string[match_first];
803 if (fastmap[t ? t[ch] : ch])
807 if (match_first < left_lim)
812 /* In this case, we can't determine easily the current byte,
813 since it might be a component byte of a multibyte
814 character. Then we use the constructed buffer instead. */
817 /* If MATCH_FIRST is out of the valid range, reconstruct the
819 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
820 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
822 err = re_string_reconstruct (&mctx.input, match_first,
824 if (BE (err != REG_NOERROR, 0))
827 offset = match_first - mctx.input.raw_mbs_idx;
829 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
830 Note that MATCH_FIRST must not be smaller than 0. */
831 ch = (match_first >= length
832 ? 0 : re_string_byte_at (&mctx.input, offset));
836 if (match_first < left_lim || match_first > right_lim)
845 /* Reconstruct the buffers so that the matcher can assume that
846 the matching starts from the beginning of the buffer. */
847 err = re_string_reconstruct (&mctx.input, match_first, eflags);
848 if (BE (err != REG_NOERROR, 0))
851 #ifdef RE_ENABLE_I18N
852 /* Don't consider this char as a possible match start if it part,
853 yet isn't the head, of a multibyte character. */
854 if (!sb && !re_string_first_byte (&mctx.input, 0))
858 /* It seems to be appropriate one, then use the matcher. */
859 /* We assume that the matching starts from 0. */
860 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
861 match_last = check_matching (&mctx, fl_longest_match,
862 start <= last_start ? &match_first : NULL);
863 if (match_last != REG_MISSING)
865 if (BE (match_last == REG_ERROR, 0))
872 mctx.match_last = match_last;
873 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
875 re_dfastate_t *pstate = mctx.state_log[match_last];
876 mctx.last_node = check_halt_state_context (&mctx, pstate,
879 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
882 err = prune_impossible_nodes (&mctx);
883 if (err == REG_NOERROR)
885 if (BE (err != REG_NOMATCH, 0))
887 match_last = REG_MISSING;
890 break; /* We found a match. */
894 match_ctx_clean (&mctx);
898 assert (match_last != REG_MISSING);
899 assert (err == REG_NOERROR);
902 /* Set pmatch[] if we need. */
907 /* Initialize registers. */
908 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
909 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
911 /* Set the points where matching start/end. */
913 pmatch[0].rm_eo = mctx.match_last;
914 /* FIXME: This function should fail if mctx.match_last exceeds
915 the maximum possible regoff_t value. We need a new error
916 code REG_OVERFLOW. */
918 if (!preg->no_sub && nmatch > 1)
920 err = set_regs (preg, &mctx, nmatch, pmatch,
921 dfa->has_plural_match && dfa->nbackref > 0);
922 if (BE (err != REG_NOERROR, 0))
926 /* At last, add the offset to the each registers, since we slided
927 the buffers so that we could assume that the matching starts
929 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
930 if (pmatch[reg_idx].rm_so != -1)
932 #ifdef RE_ENABLE_I18N
933 if (BE (mctx.input.offsets_needed != 0, 0))
935 pmatch[reg_idx].rm_so =
936 (pmatch[reg_idx].rm_so == mctx.input.valid_len
937 ? mctx.input.valid_raw_len
938 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
939 pmatch[reg_idx].rm_eo =
940 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
941 ? mctx.input.valid_raw_len
942 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
945 assert (mctx.input.offsets_needed == 0);
947 pmatch[reg_idx].rm_so += match_first;
948 pmatch[reg_idx].rm_eo += match_first;
950 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
952 pmatch[nmatch + reg_idx].rm_so = -1;
953 pmatch[nmatch + reg_idx].rm_eo = -1;
957 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
958 if (dfa->subexp_map[reg_idx] != reg_idx)
960 pmatch[reg_idx + 1].rm_so
961 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
962 pmatch[reg_idx + 1].rm_eo
963 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
968 re_free (mctx.state_log);
970 match_ctx_free (&mctx);
971 re_string_destruct (&mctx.input);
976 internal_function __attribute_warn_unused_result__
977 prune_impossible_nodes (re_match_context_t *mctx)
979 const re_dfa_t *const dfa = mctx->dfa;
980 Idx halt_node, match_last;
982 re_dfastate_t **sifted_states;
983 re_dfastate_t **lim_states = NULL;
984 re_sift_context_t sctx;
986 assert (mctx->state_log != NULL);
988 match_last = mctx->match_last;
989 halt_node = mctx->last_node;
991 /* Avoid overflow. */
992 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
995 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
996 if (BE (sifted_states == NULL, 0))
1003 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
1004 if (BE (lim_states == NULL, 0))
1011 memset (lim_states, '\0',
1012 sizeof (re_dfastate_t *) * (match_last + 1));
1013 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1015 ret = sift_states_backward (mctx, &sctx);
1016 re_node_set_free (&sctx.limits);
1017 if (BE (ret != REG_NOERROR, 0))
1019 if (sifted_states[0] != NULL || lim_states[0] != NULL)
1024 if (! REG_VALID_INDEX (match_last))
1029 } while (mctx->state_log[match_last] == NULL
1030 || !mctx->state_log[match_last]->halt);
1031 halt_node = check_halt_state_context (mctx,
1032 mctx->state_log[match_last],
1035 ret = merge_state_array (dfa, sifted_states, lim_states,
1037 re_free (lim_states);
1039 if (BE (ret != REG_NOERROR, 0))
1044 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1045 ret = sift_states_backward (mctx, &sctx);
1046 re_node_set_free (&sctx.limits);
1047 if (BE (ret != REG_NOERROR, 0))
1049 if (sifted_states[0] == NULL)
1055 re_free (mctx->state_log);
1056 mctx->state_log = sifted_states;
1057 sifted_states = NULL;
1058 mctx->last_node = halt_node;
1059 mctx->match_last = match_last;
1062 re_free (sifted_states);
1063 re_free (lim_states);
1067 /* Acquire an initial state and return it.
1068 We must select appropriate initial state depending on the context,
1069 since initial states may have constraints like "\<", "^", etc.. */
1071 static inline re_dfastate_t *
1072 __attribute ((always_inline)) internal_function
1073 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1076 const re_dfa_t *const dfa = mctx->dfa;
1077 if (dfa->init_state->has_constraint)
1079 unsigned int context;
1080 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1081 if (IS_WORD_CONTEXT (context))
1082 return dfa->init_state_word;
1083 else if (IS_ORDINARY_CONTEXT (context))
1084 return dfa->init_state;
1085 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1086 return dfa->init_state_begbuf;
1087 else if (IS_NEWLINE_CONTEXT (context))
1088 return dfa->init_state_nl;
1089 else if (IS_BEGBUF_CONTEXT (context))
1091 /* It is relatively rare case, then calculate on demand. */
1092 return re_acquire_state_context (err, dfa,
1093 dfa->init_state->entrance_nodes,
1097 /* Must not happen? */
1098 return dfa->init_state;
1101 return dfa->init_state;
1104 /* Check whether the regular expression match input string INPUT or not,
1105 and return the index where the matching end. Return REG_MISSING if
1106 there is no match, and return REG_ERROR in case of an error.
1107 FL_LONGEST_MATCH means we want the POSIX longest matching.
1108 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1109 next place where we may want to try matching.
1110 Note that the matcher assume that the maching starts from the current
1111 index of the buffer. */
1114 internal_function __attribute_warn_unused_result__
1115 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1118 const re_dfa_t *const dfa = mctx->dfa;
1121 Idx match_last = REG_MISSING;
1122 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1123 re_dfastate_t *cur_state;
1124 bool at_init_state = p_match_first != NULL;
1125 Idx next_start_idx = cur_str_idx;
1128 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1129 /* An initial state must not be NULL (invalid). */
1130 if (BE (cur_state == NULL, 0))
1132 assert (err == REG_ESPACE);
1136 if (mctx->state_log != NULL)
1138 mctx->state_log[cur_str_idx] = cur_state;
1140 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1141 later. E.g. Processing back references. */
1142 if (BE (dfa->nbackref, 0))
1144 at_init_state = false;
1145 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1146 if (BE (err != REG_NOERROR, 0))
1149 if (cur_state->has_backref)
1151 err = transit_state_bkref (mctx, &cur_state->nodes);
1152 if (BE (err != REG_NOERROR, 0))
1158 /* If the RE accepts NULL string. */
1159 if (BE (cur_state->halt, 0))
1161 if (!cur_state->has_constraint
1162 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1164 if (!fl_longest_match)
1168 match_last = cur_str_idx;
1174 while (!re_string_eoi (&mctx->input))
1176 re_dfastate_t *old_state = cur_state;
1177 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1179 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1180 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1181 && mctx->input.valid_len < mctx->input.len))
1183 err = extend_buffers (mctx);
1184 if (BE (err != REG_NOERROR, 0))
1186 assert (err == REG_ESPACE);
1191 cur_state = transit_state (&err, mctx, cur_state);
1192 if (mctx->state_log != NULL)
1193 cur_state = merge_state_with_log (&err, mctx, cur_state);
1195 if (cur_state == NULL)
1197 /* Reached the invalid state or an error. Try to recover a valid
1198 state using the state log, if available and if we have not
1199 already found a valid (even if not the longest) match. */
1200 if (BE (err != REG_NOERROR, 0))
1203 if (mctx->state_log == NULL
1204 || (match && !fl_longest_match)
1205 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1209 if (BE (at_init_state, 0))
1211 if (old_state == cur_state)
1212 next_start_idx = next_char_idx;
1214 at_init_state = false;
1217 if (cur_state->halt)
1219 /* Reached a halt state.
1220 Check the halt state can satisfy the current context. */
1221 if (!cur_state->has_constraint
1222 || check_halt_state_context (mctx, cur_state,
1223 re_string_cur_idx (&mctx->input)))
1225 /* We found an appropriate halt state. */
1226 match_last = re_string_cur_idx (&mctx->input);
1229 /* We found a match, do not modify match_first below. */
1230 p_match_first = NULL;
1231 if (!fl_longest_match)
1238 *p_match_first += next_start_idx;
1243 /* Check NODE match the current context. */
1247 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1249 re_token_type_t type = dfa->nodes[node].type;
1250 unsigned int constraint = dfa->nodes[node].constraint;
1251 if (type != END_OF_RE)
1255 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1260 /* Check the halt state STATE match the current context.
1261 Return 0 if not match, if the node, STATE has, is a halt node and
1262 match the context, return the node. */
1266 check_halt_state_context (const re_match_context_t *mctx,
1267 const re_dfastate_t *state, Idx idx)
1270 unsigned int context;
1272 assert (state->halt);
1274 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1275 for (i = 0; i < state->nodes.nelem; ++i)
1276 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1277 return state->nodes.elems[i];
1281 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1282 corresponding to the DFA).
1283 Return the destination node, and update EPS_VIA_NODES;
1284 return REG_MISSING in case of errors. */
1288 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1289 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1290 struct re_fail_stack_t *fs)
1292 const re_dfa_t *const dfa = mctx->dfa;
1295 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1297 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1298 re_node_set *edests = &dfa->edests[node];
1300 ok = re_node_set_insert (eps_via_nodes, node);
1303 /* Pick up a valid destination, or return REG_MISSING if none
1305 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1307 Idx candidate = edests->elems[i];
1308 if (!re_node_set_contains (cur_nodes, candidate))
1310 if (dest_node == REG_MISSING)
1311 dest_node = candidate;
1315 /* In order to avoid infinite loop like "(a*)*", return the second
1316 epsilon-transition if the first was already considered. */
1317 if (re_node_set_contains (eps_via_nodes, dest_node))
1320 /* Otherwise, push the second epsilon-transition on the fail stack. */
1322 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1326 /* We know we are going to exit. */
1335 re_token_type_t type = dfa->nodes[node].type;
1337 #ifdef RE_ENABLE_I18N
1338 if (dfa->nodes[node].accept_mb)
1339 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1341 #endif /* RE_ENABLE_I18N */
1342 if (type == OP_BACK_REF)
1344 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1345 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1348 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1352 char *buf = (char *) re_string_get_buffer (&mctx->input);
1353 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1362 ok = re_node_set_insert (eps_via_nodes, node);
1365 dest_node = dfa->edests[node].elems[0];
1366 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1373 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1375 Idx dest_node = dfa->nexts[node];
1376 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1377 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1378 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1381 re_node_set_empty (eps_via_nodes);
1388 static reg_errcode_t
1389 internal_function __attribute_warn_unused_result__
1390 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1391 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1394 Idx num = fs->num++;
1395 if (fs->num == fs->alloc)
1397 struct re_fail_stack_ent_t *new_array;
1398 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1400 if (new_array == NULL)
1403 fs->stack = new_array;
1405 fs->stack[num].idx = str_idx;
1406 fs->stack[num].node = dest_node;
1407 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1408 if (fs->stack[num].regs == NULL)
1410 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1411 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1417 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1418 regmatch_t *regs, re_node_set *eps_via_nodes)
1420 Idx num = --fs->num;
1421 assert (REG_VALID_INDEX (num));
1422 *pidx = fs->stack[num].idx;
1423 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1424 re_node_set_free (eps_via_nodes);
1425 re_free (fs->stack[num].regs);
1426 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1427 return fs->stack[num].node;
1430 /* Set the positions where the subexpressions are starts/ends to registers
1432 Note: We assume that pmatch[0] is already set, and
1433 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1435 static reg_errcode_t
1436 internal_function __attribute_warn_unused_result__
1437 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1438 regmatch_t *pmatch, bool fl_backtrack)
1440 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1442 re_node_set eps_via_nodes;
1443 struct re_fail_stack_t *fs;
1444 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1445 regmatch_t *prev_idx_match;
1446 bool prev_idx_match_malloced = false;
1449 assert (nmatch > 1);
1450 assert (mctx->state_log != NULL);
1455 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1456 if (fs->stack == NULL)
1462 cur_node = dfa->init_node;
1463 re_node_set_init_empty (&eps_via_nodes);
1465 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1466 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1469 prev_idx_match = re_malloc (regmatch_t, nmatch);
1470 if (prev_idx_match == NULL)
1472 free_fail_stack_return (fs);
1475 prev_idx_match_malloced = true;
1477 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1479 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1481 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1483 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1488 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1489 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1491 if (reg_idx == nmatch)
1493 re_node_set_free (&eps_via_nodes);
1494 if (prev_idx_match_malloced)
1495 re_free (prev_idx_match);
1496 return free_fail_stack_return (fs);
1498 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1503 re_node_set_free (&eps_via_nodes);
1504 if (prev_idx_match_malloced)
1505 re_free (prev_idx_match);
1510 /* Proceed to next node. */
1511 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1512 &eps_via_nodes, fs);
1514 if (BE (! REG_VALID_INDEX (cur_node), 0))
1516 if (BE (cur_node == REG_ERROR, 0))
1518 re_node_set_free (&eps_via_nodes);
1519 if (prev_idx_match_malloced)
1520 re_free (prev_idx_match);
1521 free_fail_stack_return (fs);
1525 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1529 re_node_set_free (&eps_via_nodes);
1530 if (prev_idx_match_malloced)
1531 re_free (prev_idx_match);
1536 re_node_set_free (&eps_via_nodes);
1537 if (prev_idx_match_malloced)
1538 re_free (prev_idx_match);
1539 return free_fail_stack_return (fs);
1542 static reg_errcode_t
1544 free_fail_stack_return (struct re_fail_stack_t *fs)
1549 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1551 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1552 re_free (fs->stack[fs_idx].regs);
1554 re_free (fs->stack);
1561 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1562 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1564 int type = dfa->nodes[cur_node].type;
1565 if (type == OP_OPEN_SUBEXP)
1567 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1569 /* We are at the first node of this sub expression. */
1570 if (reg_num < nmatch)
1572 pmatch[reg_num].rm_so = cur_idx;
1573 pmatch[reg_num].rm_eo = -1;
1576 else if (type == OP_CLOSE_SUBEXP)
1578 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1579 if (reg_num < nmatch)
1581 /* We are at the last node of this sub expression. */
1582 if (pmatch[reg_num].rm_so < cur_idx)
1584 pmatch[reg_num].rm_eo = cur_idx;
1585 /* This is a non-empty match or we are not inside an optional
1586 subexpression. Accept this right away. */
1587 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1591 if (dfa->nodes[cur_node].opt_subexp
1592 && prev_idx_match[reg_num].rm_so != -1)
1593 /* We transited through an empty match for an optional
1594 subexpression, like (a?)*, and this is not the subexp's
1595 first match. Copy back the old content of the registers
1596 so that matches of an inner subexpression are undone as
1597 well, like in ((a?))*. */
1598 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1600 /* We completed a subexpression, but it may be part of
1601 an optional one, so do not update PREV_IDX_MATCH. */
1602 pmatch[reg_num].rm_eo = cur_idx;
1608 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1609 and sift the nodes in each states according to the following rules.
1610 Updated state_log will be wrote to STATE_LOG.
1612 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1613 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1614 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1615 the LAST_NODE, we throw away the node `a'.
1616 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1617 string `s' and transit to `b':
1618 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1620 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1621 thrown away, we throw away the node `a'.
1622 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1623 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1625 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1626 we throw away the node `a'. */
1628 #define STATE_NODE_CONTAINS(state,node) \
1629 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1631 static reg_errcode_t
1633 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1637 Idx str_idx = sctx->last_str_idx;
1638 re_node_set cur_dest;
1641 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1644 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1645 transit to the last_node and the last_node itself. */
1646 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1647 if (BE (err != REG_NOERROR, 0))
1649 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1650 if (BE (err != REG_NOERROR, 0))
1653 /* Then check each states in the state_log. */
1656 /* Update counters. */
1657 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1658 if (null_cnt > mctx->max_mb_elem_len)
1660 memset (sctx->sifted_states, '\0',
1661 sizeof (re_dfastate_t *) * str_idx);
1662 re_node_set_free (&cur_dest);
1665 re_node_set_empty (&cur_dest);
1668 if (mctx->state_log[str_idx])
1670 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1671 if (BE (err != REG_NOERROR, 0))
1675 /* Add all the nodes which satisfy the following conditions:
1676 - It can epsilon transit to a node in CUR_DEST.
1678 And update state_log. */
1679 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1680 if (BE (err != REG_NOERROR, 0))
1685 re_node_set_free (&cur_dest);
1689 static reg_errcode_t
1690 internal_function __attribute_warn_unused_result__
1691 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1692 Idx str_idx, re_node_set *cur_dest)
1694 const re_dfa_t *const dfa = mctx->dfa;
1695 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1698 /* Then build the next sifted state.
1699 We build the next sifted state on `cur_dest', and update
1700 `sifted_states[str_idx]' with `cur_dest'.
1702 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1703 `cur_src' points the node_set of the old `state_log[str_idx]'
1704 (with the epsilon nodes pre-filtered out). */
1705 for (i = 0; i < cur_src->nelem; i++)
1707 Idx prev_node = cur_src->elems[i];
1712 re_token_type_t type = dfa->nodes[prev_node].type;
1713 assert (!IS_EPSILON_NODE (type));
1715 #ifdef RE_ENABLE_I18N
1716 /* If the node may accept `multi byte'. */
1717 if (dfa->nodes[prev_node].accept_mb)
1718 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1719 str_idx, sctx->last_str_idx);
1720 #endif /* RE_ENABLE_I18N */
1722 /* We don't check backreferences here.
1723 See update_cur_sifted_state(). */
1725 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1726 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1727 dfa->nexts[prev_node]))
1733 if (sctx->limits.nelem)
1735 Idx to_idx = str_idx + naccepted;
1736 if (check_dst_limits (mctx, &sctx->limits,
1737 dfa->nexts[prev_node], to_idx,
1738 prev_node, str_idx))
1741 ok = re_node_set_insert (cur_dest, prev_node);
1749 /* Helper functions. */
1751 static reg_errcode_t
1753 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1755 Idx top = mctx->state_log_top;
1757 if (next_state_log_idx >= mctx->input.bufs_len
1758 || (next_state_log_idx >= mctx->input.valid_len
1759 && mctx->input.valid_len < mctx->input.len))
1762 err = extend_buffers (mctx);
1763 if (BE (err != REG_NOERROR, 0))
1767 if (top < next_state_log_idx)
1769 memset (mctx->state_log + top + 1, '\0',
1770 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1771 mctx->state_log_top = next_state_log_idx;
1776 static reg_errcode_t
1778 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1779 re_dfastate_t **src, Idx num)
1783 for (st_idx = 0; st_idx < num; ++st_idx)
1785 if (dst[st_idx] == NULL)
1786 dst[st_idx] = src[st_idx];
1787 else if (src[st_idx] != NULL)
1789 re_node_set merged_set;
1790 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1791 &src[st_idx]->nodes);
1792 if (BE (err != REG_NOERROR, 0))
1794 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1795 re_node_set_free (&merged_set);
1796 if (BE (err != REG_NOERROR, 0))
1803 static reg_errcode_t
1805 update_cur_sifted_state (const re_match_context_t *mctx,
1806 re_sift_context_t *sctx, Idx str_idx,
1807 re_node_set *dest_nodes)
1809 const re_dfa_t *const dfa = mctx->dfa;
1810 reg_errcode_t err = REG_NOERROR;
1811 const re_node_set *candidates;
1812 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1813 : &mctx->state_log[str_idx]->nodes);
1815 if (dest_nodes->nelem == 0)
1816 sctx->sifted_states[str_idx] = NULL;
1821 /* At first, add the nodes which can epsilon transit to a node in
1823 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1824 if (BE (err != REG_NOERROR, 0))
1827 /* Then, check the limitations in the current sift_context. */
1828 if (sctx->limits.nelem)
1830 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1831 mctx->bkref_ents, str_idx);
1832 if (BE (err != REG_NOERROR, 0))
1837 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1838 if (BE (err != REG_NOERROR, 0))
1842 if (candidates && mctx->state_log[str_idx]->has_backref)
1844 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1845 if (BE (err != REG_NOERROR, 0))
1851 static reg_errcode_t
1852 internal_function __attribute_warn_unused_result__
1853 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1854 const re_node_set *candidates)
1856 reg_errcode_t err = REG_NOERROR;
1859 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1860 if (BE (err != REG_NOERROR, 0))
1863 if (!state->inveclosure.alloc)
1865 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1866 if (BE (err != REG_NOERROR, 0))
1868 for (i = 0; i < dest_nodes->nelem; i++)
1870 err = re_node_set_merge (&state->inveclosure,
1871 dfa->inveclosures + dest_nodes->elems[i]);
1872 if (BE (err != REG_NOERROR, 0))
1876 return re_node_set_add_intersect (dest_nodes, candidates,
1877 &state->inveclosure);
1880 static reg_errcode_t
1882 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1883 const re_node_set *candidates)
1887 re_node_set *inv_eclosure = dfa->inveclosures + node;
1888 re_node_set except_nodes;
1889 re_node_set_init_empty (&except_nodes);
1890 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1892 Idx cur_node = inv_eclosure->elems[ecl_idx];
1893 if (cur_node == node)
1895 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1897 Idx edst1 = dfa->edests[cur_node].elems[0];
1898 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1899 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1900 if ((!re_node_set_contains (inv_eclosure, edst1)
1901 && re_node_set_contains (dest_nodes, edst1))
1902 || (REG_VALID_NONZERO_INDEX (edst2)
1903 && !re_node_set_contains (inv_eclosure, edst2)
1904 && re_node_set_contains (dest_nodes, edst2)))
1906 err = re_node_set_add_intersect (&except_nodes, candidates,
1907 dfa->inveclosures + cur_node);
1908 if (BE (err != REG_NOERROR, 0))
1910 re_node_set_free (&except_nodes);
1916 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1918 Idx cur_node = inv_eclosure->elems[ecl_idx];
1919 if (!re_node_set_contains (&except_nodes, cur_node))
1921 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1922 re_node_set_remove_at (dest_nodes, idx);
1925 re_node_set_free (&except_nodes);
1931 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1932 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1934 const re_dfa_t *const dfa = mctx->dfa;
1935 Idx lim_idx, src_pos, dst_pos;
1937 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1938 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1939 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1942 struct re_backref_cache_entry *ent;
1943 ent = mctx->bkref_ents + limits->elems[lim_idx];
1944 subexp_idx = dfa->nodes[ent->node].opr.idx;
1946 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1947 subexp_idx, dst_node, dst_idx,
1949 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1950 subexp_idx, src_node, src_idx,
1954 <src> <dst> ( <subexp> )
1955 ( <subexp> ) <src> <dst>
1956 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1957 if (src_pos == dst_pos)
1958 continue; /* This is unrelated limitation. */
1967 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1968 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1970 const re_dfa_t *const dfa = mctx->dfa;
1971 const re_node_set *eclosures = dfa->eclosures + from_node;
1974 /* Else, we are on the boundary: examine the nodes on the epsilon
1976 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1978 Idx node = eclosures->elems[node_idx];
1979 switch (dfa->nodes[node].type)
1982 if (bkref_idx != REG_MISSING)
1984 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1990 if (ent->node != node)
1993 if (subexp_idx < BITSET_WORD_BITS
1994 && !(ent->eps_reachable_subexps_map
1995 & ((bitset_word_t) 1 << subexp_idx)))
1998 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1999 OP_CLOSE_SUBEXP cases below. But, if the
2000 destination node is the same node as the source
2001 node, don't recurse because it would cause an
2002 infinite loop: a regex that exhibits this behavior
2004 dst = dfa->edests[node].elems[0];
2005 if (dst == from_node)
2009 else /* if (boundaries & 2) */
2014 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2016 if (cpos == -1 /* && (boundaries & 1) */)
2018 if (cpos == 0 && (boundaries & 2))
2021 if (subexp_idx < BITSET_WORD_BITS)
2022 ent->eps_reachable_subexps_map
2023 &= ~((bitset_word_t) 1 << subexp_idx);
2025 while (ent++->more);
2029 case OP_OPEN_SUBEXP:
2030 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2034 case OP_CLOSE_SUBEXP:
2035 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2044 return (boundaries & 2) ? 1 : 0;
2049 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
2050 Idx subexp_idx, Idx from_node, Idx str_idx,
2053 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2056 /* If we are outside the range of the subexpression, return -1 or 1. */
2057 if (str_idx < lim->subexp_from)
2060 if (lim->subexp_to < str_idx)
2063 /* If we are within the subexpression, return 0. */
2064 boundaries = (str_idx == lim->subexp_from);
2065 boundaries |= (str_idx == lim->subexp_to) << 1;
2066 if (boundaries == 0)
2069 /* Else, examine epsilon closure. */
2070 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2071 from_node, bkref_idx);
2074 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2075 which are against limitations from DEST_NODES. */
2077 static reg_errcode_t
2079 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2080 const re_node_set *candidates, re_node_set *limits,
2081 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2084 Idx node_idx, lim_idx;
2086 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2089 struct re_backref_cache_entry *ent;
2090 ent = bkref_ents + limits->elems[lim_idx];
2092 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2093 continue; /* This is unrelated limitation. */
2095 subexp_idx = dfa->nodes[ent->node].opr.idx;
2096 if (ent->subexp_to == str_idx)
2098 Idx ops_node = REG_MISSING;
2099 Idx cls_node = REG_MISSING;
2100 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2102 Idx node = dest_nodes->elems[node_idx];
2103 re_token_type_t type = dfa->nodes[node].type;
2104 if (type == OP_OPEN_SUBEXP
2105 && subexp_idx == dfa->nodes[node].opr.idx)
2107 else if (type == OP_CLOSE_SUBEXP
2108 && subexp_idx == dfa->nodes[node].opr.idx)
2112 /* Check the limitation of the open subexpression. */
2113 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2114 if (REG_VALID_INDEX (ops_node))
2116 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2118 if (BE (err != REG_NOERROR, 0))
2122 /* Check the limitation of the close subexpression. */
2123 if (REG_VALID_INDEX (cls_node))
2124 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2126 Idx node = dest_nodes->elems[node_idx];
2127 if (!re_node_set_contains (dfa->inveclosures + node,
2129 && !re_node_set_contains (dfa->eclosures + node,
2132 /* It is against this limitation.
2133 Remove it form the current sifted state. */
2134 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2136 if (BE (err != REG_NOERROR, 0))
2142 else /* (ent->subexp_to != str_idx) */
2144 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2146 Idx node = dest_nodes->elems[node_idx];
2147 re_token_type_t type = dfa->nodes[node].type;
2148 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2150 if (subexp_idx != dfa->nodes[node].opr.idx)
2152 /* It is against this limitation.
2153 Remove it form the current sifted state. */
2154 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2156 if (BE (err != REG_NOERROR, 0))
2165 static reg_errcode_t
2166 internal_function __attribute_warn_unused_result__
2167 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2168 Idx str_idx, const re_node_set *candidates)
2170 const re_dfa_t *const dfa = mctx->dfa;
2173 re_sift_context_t local_sctx;
2174 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2176 if (first_idx == REG_MISSING)
2179 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2181 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2184 re_token_type_t type;
2185 struct re_backref_cache_entry *entry;
2186 node = candidates->elems[node_idx];
2187 type = dfa->nodes[node].type;
2188 /* Avoid infinite loop for the REs like "()\1+". */
2189 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2191 if (type != OP_BACK_REF)
2194 entry = mctx->bkref_ents + first_idx;
2195 enabled_idx = first_idx;
2202 re_dfastate_t *cur_state;
2204 if (entry->node != node)
2206 subexp_len = entry->subexp_to - entry->subexp_from;
2207 to_idx = str_idx + subexp_len;
2208 dst_node = (subexp_len ? dfa->nexts[node]
2209 : dfa->edests[node].elems[0]);
2211 if (to_idx > sctx->last_str_idx
2212 || sctx->sifted_states[to_idx] == NULL
2213 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2214 || check_dst_limits (mctx, &sctx->limits, node,
2215 str_idx, dst_node, to_idx))
2218 if (local_sctx.sifted_states == NULL)
2221 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2222 if (BE (err != REG_NOERROR, 0))
2225 local_sctx.last_node = node;
2226 local_sctx.last_str_idx = str_idx;
2227 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2233 cur_state = local_sctx.sifted_states[str_idx];
2234 err = sift_states_backward (mctx, &local_sctx);
2235 if (BE (err != REG_NOERROR, 0))
2237 if (sctx->limited_states != NULL)
2239 err = merge_state_array (dfa, sctx->limited_states,
2240 local_sctx.sifted_states,
2242 if (BE (err != REG_NOERROR, 0))
2245 local_sctx.sifted_states[str_idx] = cur_state;
2246 re_node_set_remove (&local_sctx.limits, enabled_idx);
2248 /* mctx->bkref_ents may have changed, reload the pointer. */
2249 entry = mctx->bkref_ents + enabled_idx;
2251 while (enabled_idx++, entry++->more);
2255 if (local_sctx.sifted_states != NULL)
2257 re_node_set_free (&local_sctx.limits);
2264 #ifdef RE_ENABLE_I18N
2267 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2268 Idx node_idx, Idx str_idx, Idx max_str_idx)
2270 const re_dfa_t *const dfa = mctx->dfa;
2272 /* Check the node can accept `multi byte'. */
2273 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2274 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2275 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2276 dfa->nexts[node_idx]))
2277 /* The node can't accept the `multi byte', or the
2278 destination was already thrown away, then the node
2279 could't accept the current input `multi byte'. */
2281 /* Otherwise, it is sure that the node could accept
2282 `naccepted' bytes input. */
2285 #endif /* RE_ENABLE_I18N */
2288 /* Functions for state transition. */
2290 /* Return the next state to which the current state STATE will transit by
2291 accepting the current input byte, and update STATE_LOG if necessary.
2292 If STATE can accept a multibyte char/collating element/back reference
2293 update the destination of STATE_LOG. */
2295 static re_dfastate_t *
2296 internal_function __attribute_warn_unused_result__
2297 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2298 re_dfastate_t *state)
2300 re_dfastate_t **trtable;
2303 #ifdef RE_ENABLE_I18N
2304 /* If the current state can accept multibyte. */
2305 if (BE (state->accept_mb, 0))
2307 *err = transit_state_mb (mctx, state);
2308 if (BE (*err != REG_NOERROR, 0))
2311 #endif /* RE_ENABLE_I18N */
2313 /* Then decide the next state with the single byte. */
2316 /* don't use transition table */
2317 return transit_state_sb (err, mctx, state);
2320 /* Use transition table */
2321 ch = re_string_fetch_byte (&mctx->input);
2324 trtable = state->trtable;
2325 if (BE (trtable != NULL, 1))
2328 trtable = state->word_trtable;
2329 if (BE (trtable != NULL, 1))
2331 unsigned int context;
2333 = re_string_context_at (&mctx->input,
2334 re_string_cur_idx (&mctx->input) - 1,
2336 if (IS_WORD_CONTEXT (context))
2337 return trtable[ch + SBC_MAX];
2342 if (!build_trtable (mctx->dfa, state))
2348 /* Retry, we now have a transition table. */
2352 /* Update the state_log if we need */
2353 static re_dfastate_t *
2355 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2356 re_dfastate_t *next_state)
2358 const re_dfa_t *const dfa = mctx->dfa;
2359 Idx cur_idx = re_string_cur_idx (&mctx->input);
2361 if (cur_idx > mctx->state_log_top)
2363 mctx->state_log[cur_idx] = next_state;
2364 mctx->state_log_top = cur_idx;
2366 else if (mctx->state_log[cur_idx] == 0)
2368 mctx->state_log[cur_idx] = next_state;
2372 re_dfastate_t *pstate;
2373 unsigned int context;
2374 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2375 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2376 the destination of a multibyte char/collating element/
2377 back reference. Then the next state is the union set of
2378 these destinations and the results of the transition table. */
2379 pstate = mctx->state_log[cur_idx];
2380 log_nodes = pstate->entrance_nodes;
2381 if (next_state != NULL)
2383 table_nodes = next_state->entrance_nodes;
2384 *err = re_node_set_init_union (&next_nodes, table_nodes,
2386 if (BE (*err != REG_NOERROR, 0))
2390 next_nodes = *log_nodes;
2391 /* Note: We already add the nodes of the initial state,
2392 then we don't need to add them here. */
2394 context = re_string_context_at (&mctx->input,
2395 re_string_cur_idx (&mctx->input) - 1,
2397 next_state = mctx->state_log[cur_idx]
2398 = re_acquire_state_context (err, dfa, &next_nodes, context);
2399 /* We don't need to check errors here, since the return value of
2400 this function is next_state and ERR is already set. */
2402 if (table_nodes != NULL)
2403 re_node_set_free (&next_nodes);
2406 if (BE (dfa->nbackref, 0) && next_state != NULL)
2408 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2409 later. We must check them here, since the back references in the
2410 next state might use them. */
2411 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2413 if (BE (*err != REG_NOERROR, 0))
2416 /* If the next state has back references. */
2417 if (next_state->has_backref)
2419 *err = transit_state_bkref (mctx, &next_state->nodes);
2420 if (BE (*err != REG_NOERROR, 0))
2422 next_state = mctx->state_log[cur_idx];
2429 /* Skip bytes in the input that correspond to part of a
2430 multi-byte match, then look in the log for a state
2431 from which to restart matching. */
2432 static re_dfastate_t *
2434 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2436 re_dfastate_t *cur_state;
2439 Idx max = mctx->state_log_top;
2440 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2444 if (++cur_str_idx > max)
2446 re_string_skip_bytes (&mctx->input, 1);
2448 while (mctx->state_log[cur_str_idx] == NULL);
2450 cur_state = merge_state_with_log (err, mctx, NULL);
2452 while (*err == REG_NOERROR && cur_state == NULL);
2456 /* Helper functions for transit_state. */
2458 /* From the node set CUR_NODES, pick up the nodes whose types are
2459 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2460 expression. And register them to use them later for evaluating the
2461 correspoding back references. */
2463 static reg_errcode_t
2465 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2468 const re_dfa_t *const dfa = mctx->dfa;
2472 /* TODO: This isn't efficient.
2473 Because there might be more than one nodes whose types are
2474 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2477 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2479 Idx node = cur_nodes->elems[node_idx];
2480 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2481 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2482 && (dfa->used_bkref_map
2483 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2485 err = match_ctx_add_subtop (mctx, node, str_idx);
2486 if (BE (err != REG_NOERROR, 0))
2494 /* Return the next state to which the current state STATE will transit by
2495 accepting the current input byte. */
2497 static re_dfastate_t *
2498 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2499 re_dfastate_t *state)
2501 const re_dfa_t *const dfa = mctx->dfa;
2502 re_node_set next_nodes;
2503 re_dfastate_t *next_state;
2504 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2505 unsigned int context;
2507 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2508 if (BE (*err != REG_NOERROR, 0))
2510 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2512 Idx cur_node = state->nodes.elems[node_cnt];
2513 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2515 *err = re_node_set_merge (&next_nodes,
2516 dfa->eclosures + dfa->nexts[cur_node]);
2517 if (BE (*err != REG_NOERROR, 0))
2519 re_node_set_free (&next_nodes);
2524 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2525 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2526 /* We don't need to check errors here, since the return value of
2527 this function is next_state and ERR is already set. */
2529 re_node_set_free (&next_nodes);
2530 re_string_skip_bytes (&mctx->input, 1);
2535 #ifdef RE_ENABLE_I18N
2536 static reg_errcode_t
2538 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2540 const re_dfa_t *const dfa = mctx->dfa;
2544 for (i = 0; i < pstate->nodes.nelem; ++i)
2546 re_node_set dest_nodes, *new_nodes;
2547 Idx cur_node_idx = pstate->nodes.elems[i];
2550 unsigned int context;
2551 re_dfastate_t *dest_state;
2553 if (!dfa->nodes[cur_node_idx].accept_mb)
2556 if (dfa->nodes[cur_node_idx].constraint)
2558 context = re_string_context_at (&mctx->input,
2559 re_string_cur_idx (&mctx->input),
2561 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2566 /* How many bytes the node can accept? */
2567 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2568 re_string_cur_idx (&mctx->input));
2572 /* The node can accepts `naccepted' bytes. */
2573 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2574 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2575 : mctx->max_mb_elem_len);
2576 err = clean_state_log_if_needed (mctx, dest_idx);
2577 if (BE (err != REG_NOERROR, 0))
2580 assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2582 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2584 dest_state = mctx->state_log[dest_idx];
2585 if (dest_state == NULL)
2586 dest_nodes = *new_nodes;
2589 err = re_node_set_init_union (&dest_nodes,
2590 dest_state->entrance_nodes, new_nodes);
2591 if (BE (err != REG_NOERROR, 0))
2594 context = re_string_context_at (&mctx->input, dest_idx - 1,
2596 mctx->state_log[dest_idx]
2597 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2598 if (dest_state != NULL)
2599 re_node_set_free (&dest_nodes);
2600 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2605 #endif /* RE_ENABLE_I18N */
2607 static reg_errcode_t
2609 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2611 const re_dfa_t *const dfa = mctx->dfa;
2614 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2616 for (i = 0; i < nodes->nelem; ++i)
2618 Idx dest_str_idx, prev_nelem, bkc_idx;
2619 Idx node_idx = nodes->elems[i];
2620 unsigned int context;
2621 const re_token_t *node = dfa->nodes + node_idx;
2622 re_node_set *new_dest_nodes;
2624 /* Check whether `node' is a backreference or not. */
2625 if (node->type != OP_BACK_REF)
2628 if (node->constraint)
2630 context = re_string_context_at (&mctx->input, cur_str_idx,
2632 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2636 /* `node' is a backreference.
2637 Check the substring which the substring matched. */
2638 bkc_idx = mctx->nbkref_ents;
2639 err = get_subexp (mctx, node_idx, cur_str_idx);
2640 if (BE (err != REG_NOERROR, 0))
2643 /* And add the epsilon closures (which is `new_dest_nodes') of
2644 the backreference to appropriate state_log. */
2646 assert (dfa->nexts[node_idx] != REG_MISSING);
2648 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2651 re_dfastate_t *dest_state;
2652 struct re_backref_cache_entry *bkref_ent;
2653 bkref_ent = mctx->bkref_ents + bkc_idx;
2654 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2656 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2657 new_dest_nodes = (subexp_len == 0
2658 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2659 : dfa->eclosures + dfa->nexts[node_idx]);
2660 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2661 - bkref_ent->subexp_from);
2662 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2664 dest_state = mctx->state_log[dest_str_idx];
2665 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2666 : mctx->state_log[cur_str_idx]->nodes.nelem);
2667 /* Add `new_dest_node' to state_log. */
2668 if (dest_state == NULL)
2670 mctx->state_log[dest_str_idx]
2671 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2673 if (BE (mctx->state_log[dest_str_idx] == NULL
2674 && err != REG_NOERROR, 0))
2679 re_node_set dest_nodes;
2680 err = re_node_set_init_union (&dest_nodes,
2681 dest_state->entrance_nodes,
2683 if (BE (err != REG_NOERROR, 0))
2685 re_node_set_free (&dest_nodes);
2688 mctx->state_log[dest_str_idx]
2689 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2690 re_node_set_free (&dest_nodes);
2691 if (BE (mctx->state_log[dest_str_idx] == NULL
2692 && err != REG_NOERROR, 0))
2695 /* We need to check recursively if the backreference can epsilon
2698 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2700 err = check_subexp_matching_top (mctx, new_dest_nodes,
2702 if (BE (err != REG_NOERROR, 0))
2704 err = transit_state_bkref (mctx, new_dest_nodes);
2705 if (BE (err != REG_NOERROR, 0))
2715 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2716 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2717 Note that we might collect inappropriate candidates here.
2718 However, the cost of checking them strictly here is too high, then we
2719 delay these checking for prune_impossible_nodes(). */
2721 static reg_errcode_t
2722 internal_function __attribute_warn_unused_result__
2723 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2725 const re_dfa_t *const dfa = mctx->dfa;
2726 Idx subexp_num, sub_top_idx;
2727 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2728 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2729 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2730 if (cache_idx != REG_MISSING)
2732 const struct re_backref_cache_entry *entry
2733 = mctx->bkref_ents + cache_idx;
2735 if (entry->node == bkref_node)
2736 return REG_NOERROR; /* We already checked it. */
2737 while (entry++->more);
2740 subexp_num = dfa->nodes[bkref_node].opr.idx;
2742 /* For each sub expression */
2743 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2746 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2747 re_sub_match_last_t *sub_last;
2748 Idx sub_last_idx, sl_str, bkref_str_off;
2750 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2751 continue; /* It isn't related. */
2753 sl_str = sub_top->str_idx;
2754 bkref_str_off = bkref_str_idx;
2755 /* At first, check the last node of sub expressions we already
2757 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2759 regoff_t sl_str_diff;
2760 sub_last = sub_top->lasts[sub_last_idx];
2761 sl_str_diff = sub_last->str_idx - sl_str;
2762 /* The matched string by the sub expression match with the substring
2763 at the back reference? */
2764 if (sl_str_diff > 0)
2766 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2768 /* Not enough chars for a successful match. */
2769 if (bkref_str_off + sl_str_diff > mctx->input.len)
2772 err = clean_state_log_if_needed (mctx,
2775 if (BE (err != REG_NOERROR, 0))
2777 buf = (const char *) re_string_get_buffer (&mctx->input);
2779 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2780 /* We don't need to search this sub expression any more. */
2783 bkref_str_off += sl_str_diff;
2784 sl_str += sl_str_diff;
2785 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2788 /* Reload buf, since the preceding call might have reallocated
2790 buf = (const char *) re_string_get_buffer (&mctx->input);
2792 if (err == REG_NOMATCH)
2794 if (BE (err != REG_NOERROR, 0))
2798 if (sub_last_idx < sub_top->nlasts)
2800 if (sub_last_idx > 0)
2802 /* Then, search for the other last nodes of the sub expression. */
2803 for (; sl_str <= bkref_str_idx; ++sl_str)
2806 regoff_t sl_str_off;
2807 const re_node_set *nodes;
2808 sl_str_off = sl_str - sub_top->str_idx;
2809 /* The matched string by the sub expression match with the substring
2810 at the back reference? */
2813 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2815 /* If we are at the end of the input, we cannot match. */
2816 if (bkref_str_off >= mctx->input.len)
2819 err = extend_buffers (mctx);
2820 if (BE (err != REG_NOERROR, 0))
2823 buf = (const char *) re_string_get_buffer (&mctx->input);
2825 if (buf [bkref_str_off++] != buf[sl_str - 1])
2826 break; /* We don't need to search this sub expression
2829 if (mctx->state_log[sl_str] == NULL)
2831 /* Does this state have a ')' of the sub expression? */
2832 nodes = &mctx->state_log[sl_str]->nodes;
2833 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2835 if (cls_node == REG_MISSING)
2837 if (sub_top->path == NULL)
2839 sub_top->path = calloc (sizeof (state_array_t),
2840 sl_str - sub_top->str_idx + 1);
2841 if (sub_top->path == NULL)
2844 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2845 in the current context? */
2846 err = check_arrival (mctx, sub_top->path, sub_top->node,
2847 sub_top->str_idx, cls_node, sl_str,
2849 if (err == REG_NOMATCH)
2851 if (BE (err != REG_NOERROR, 0))
2853 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2854 if (BE (sub_last == NULL, 0))
2856 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2858 if (err == REG_NOMATCH)
2865 /* Helper functions for get_subexp(). */
2867 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2868 If it can arrive, register the sub expression expressed with SUB_TOP
2871 static reg_errcode_t
2873 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2874 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2878 /* Can the subexpression arrive the back reference? */
2879 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2880 sub_last->str_idx, bkref_node, bkref_str,
2882 if (err != REG_NOERROR)
2884 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2886 if (BE (err != REG_NOERROR, 0))
2888 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2889 return clean_state_log_if_needed (mctx, to_idx);
2892 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2893 Search '(' if FL_OPEN, or search ')' otherwise.
2894 TODO: This function isn't efficient...
2895 Because there might be more than one nodes whose types are
2896 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2902 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2903 Idx subexp_idx, int type)
2906 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2908 Idx cls_node = nodes->elems[cls_idx];
2909 const re_token_t *node = dfa->nodes + cls_node;
2910 if (node->type == type
2911 && node->opr.idx == subexp_idx)
2917 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2918 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2920 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2922 static reg_errcode_t
2923 internal_function __attribute_warn_unused_result__
2924 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2925 Idx top_str, Idx last_node, Idx last_str, int type)
2927 const re_dfa_t *const dfa = mctx->dfa;
2928 reg_errcode_t err = REG_NOERROR;
2929 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2930 re_dfastate_t *cur_state = NULL;
2931 re_node_set *cur_nodes, next_nodes;
2932 re_dfastate_t **backup_state_log;
2933 unsigned int context;
2935 subexp_num = dfa->nodes[top_node].opr.idx;
2936 /* Extend the buffer if we need. */
2937 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2939 re_dfastate_t **new_array;
2940 Idx old_alloc = path->alloc;
2941 Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
2942 if (BE (new_alloc < old_alloc, 0)
2943 || BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2945 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2946 if (BE (new_array == NULL, 0))
2948 path->array = new_array;
2949 path->alloc = new_alloc;
2950 memset (new_array + old_alloc, '\0',
2951 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2954 str_idx = path->next_idx ? path->next_idx : top_str;
2956 /* Temporary modify MCTX. */
2957 backup_state_log = mctx->state_log;
2958 backup_cur_idx = mctx->input.cur_idx;
2959 mctx->state_log = path->array;
2960 mctx->input.cur_idx = str_idx;
2962 /* Setup initial node set. */
2963 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2964 if (str_idx == top_str)
2966 err = re_node_set_init_1 (&next_nodes, top_node);
2967 if (BE (err != REG_NOERROR, 0))
2969 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2970 if (BE (err != REG_NOERROR, 0))
2972 re_node_set_free (&next_nodes);
2978 cur_state = mctx->state_log[str_idx];
2979 if (cur_state && cur_state->has_backref)
2981 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2982 if (BE (err != REG_NOERROR, 0))
2986 re_node_set_init_empty (&next_nodes);
2988 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2990 if (next_nodes.nelem)
2992 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2994 if (BE (err != REG_NOERROR, 0))
2996 re_node_set_free (&next_nodes);
3000 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3001 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3003 re_node_set_free (&next_nodes);
3006 mctx->state_log[str_idx] = cur_state;
3009 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
3011 re_node_set_empty (&next_nodes);
3012 if (mctx->state_log[str_idx + 1])
3014 err = re_node_set_merge (&next_nodes,
3015 &mctx->state_log[str_idx + 1]->nodes);
3016 if (BE (err != REG_NOERROR, 0))
3018 re_node_set_free (&next_nodes);
3024 err = check_arrival_add_next_nodes (mctx, str_idx,
3025 &cur_state->non_eps_nodes,
3027 if (BE (err != REG_NOERROR, 0))
3029 re_node_set_free (&next_nodes);
3034 if (next_nodes.nelem)
3036 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3037 if (BE (err != REG_NOERROR, 0))
3039 re_node_set_free (&next_nodes);
3042 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3044 if (BE (err != REG_NOERROR, 0))
3046 re_node_set_free (&next_nodes);
3050 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3051 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3052 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3054 re_node_set_free (&next_nodes);
3057 mctx->state_log[str_idx] = cur_state;
3058 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3060 re_node_set_free (&next_nodes);
3061 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3062 : &mctx->state_log[last_str]->nodes);
3063 path->next_idx = str_idx;
3066 mctx->state_log = backup_state_log;
3067 mctx->input.cur_idx = backup_cur_idx;
3069 /* Then check the current node set has the node LAST_NODE. */
3070 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3076 /* Helper functions for check_arrival. */
3078 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3080 TODO: This function is similar to the functions transit_state*(),
3081 however this function has many additional works.
3082 Can't we unify them? */
3084 static reg_errcode_t
3085 internal_function __attribute_warn_unused_result__
3086 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3087 re_node_set *cur_nodes, re_node_set *next_nodes)
3089 const re_dfa_t *const dfa = mctx->dfa;
3092 #ifdef RE_ENABLE_I18N
3093 reg_errcode_t err = REG_NOERROR;
3095 re_node_set union_set;
3096 re_node_set_init_empty (&union_set);
3097 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3100 Idx cur_node = cur_nodes->elems[cur_idx];
3102 re_token_type_t type = dfa->nodes[cur_node].type;
3103 assert (!IS_EPSILON_NODE (type));
3105 #ifdef RE_ENABLE_I18N
3106 /* If the node may accept `multi byte'. */
3107 if (dfa->nodes[cur_node].accept_mb)
3109 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3113 re_dfastate_t *dest_state;
3114 Idx next_node = dfa->nexts[cur_node];
3115 Idx next_idx = str_idx + naccepted;
3116 dest_state = mctx->state_log[next_idx];
3117 re_node_set_empty (&union_set);
3120 err = re_node_set_merge (&union_set, &dest_state->nodes);
3121 if (BE (err != REG_NOERROR, 0))
3123 re_node_set_free (&union_set);
3127 ok = re_node_set_insert (&union_set, next_node);
3130 re_node_set_free (&union_set);
3133 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3135 if (BE (mctx->state_log[next_idx] == NULL
3136 && err != REG_NOERROR, 0))
3138 re_node_set_free (&union_set);
3143 #endif /* RE_ENABLE_I18N */
3145 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3147 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3150 re_node_set_free (&union_set);
3155 re_node_set_free (&union_set);
3159 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3160 CUR_NODES, however exclude the nodes which are:
3161 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3162 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3165 static reg_errcode_t
3167 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3168 Idx ex_subexp, int type)
3171 Idx idx, outside_node;
3172 re_node_set new_nodes;
3174 assert (cur_nodes->nelem);
3176 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3177 if (BE (err != REG_NOERROR, 0))
3179 /* Create a new node set NEW_NODES with the nodes which are epsilon
3180 closures of the node in CUR_NODES. */
3182 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3184 Idx cur_node = cur_nodes->elems[idx];
3185 const re_node_set *eclosure = dfa->eclosures + cur_node;
3186 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3187 if (outside_node == REG_MISSING)
3189 /* There are no problematic nodes, just merge them. */
3190 err = re_node_set_merge (&new_nodes, eclosure);
3191 if (BE (err != REG_NOERROR, 0))
3193 re_node_set_free (&new_nodes);
3199 /* There are problematic nodes, re-calculate incrementally. */
3200 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3202 if (BE (err != REG_NOERROR, 0))
3204 re_node_set_free (&new_nodes);
3209 re_node_set_free (cur_nodes);
3210 *cur_nodes = new_nodes;
3214 /* Helper function for check_arrival_expand_ecl.
3215 Check incrementally the epsilon closure of TARGET, and if it isn't
3216 problematic append it to DST_NODES. */
3218 static reg_errcode_t
3219 internal_function __attribute_warn_unused_result__
3220 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3221 Idx target, Idx ex_subexp, int type)
3224 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3228 if (dfa->nodes[cur_node].type == type
3229 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3231 if (type == OP_CLOSE_SUBEXP)
3233 ok = re_node_set_insert (dst_nodes, cur_node);
3239 ok = re_node_set_insert (dst_nodes, cur_node);
3242 if (dfa->edests[cur_node].nelem == 0)
3244 if (dfa->edests[cur_node].nelem == 2)
3247 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3248 dfa->edests[cur_node].elems[1],
3250 if (BE (err != REG_NOERROR, 0))
3253 cur_node = dfa->edests[cur_node].elems[0];
3259 /* For all the back references in the current state, calculate the
3260 destination of the back references by the appropriate entry
3261 in MCTX->BKREF_ENTS. */
3263 static reg_errcode_t
3264 internal_function __attribute_warn_unused_result__
3265 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3266 Idx cur_str, Idx subexp_num, int type)
3268 const re_dfa_t *const dfa = mctx->dfa;
3270 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3271 struct re_backref_cache_entry *ent;
3273 if (cache_idx_start == REG_MISSING)
3277 ent = mctx->bkref_ents + cache_idx_start;
3280 Idx to_idx, next_node;
3282 /* Is this entry ENT is appropriate? */
3283 if (!re_node_set_contains (cur_nodes, ent->node))
3286 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3287 /* Calculate the destination of the back reference, and append it
3288 to MCTX->STATE_LOG. */
3289 if (to_idx == cur_str)
3291 /* The backreference did epsilon transit, we must re-check all the
3292 node in the current state. */
3293 re_node_set new_dests;
3294 reg_errcode_t err2, err3;
3295 next_node = dfa->edests[ent->node].elems[0];
3296 if (re_node_set_contains (cur_nodes, next_node))
3298 err = re_node_set_init_1 (&new_dests, next_node);
3299 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3300 err3 = re_node_set_merge (cur_nodes, &new_dests);
3301 re_node_set_free (&new_dests);
3302 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3303 || err3 != REG_NOERROR, 0))
3305 err = (err != REG_NOERROR ? err
3306 : (err2 != REG_NOERROR ? err2 : err3));
3309 /* TODO: It is still inefficient... */
3314 re_node_set union_set;
3315 next_node = dfa->nexts[ent->node];
3316 if (mctx->state_log[to_idx])
3319 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3322 err = re_node_set_init_copy (&union_set,
3323 &mctx->state_log[to_idx]->nodes);
3324 ok = re_node_set_insert (&union_set, next_node);
3325 if (BE (err != REG_NOERROR || ! ok, 0))
3327 re_node_set_free (&union_set);
3328 err = err != REG_NOERROR ? err : REG_ESPACE;
3334 err = re_node_set_init_1 (&union_set, next_node);
3335 if (BE (err != REG_NOERROR, 0))
3338 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3339 re_node_set_free (&union_set);
3340 if (BE (mctx->state_log[to_idx] == NULL
3341 && err != REG_NOERROR, 0))
3345 while (ent++->more);
3349 /* Build transition table for the state.
3350 Return true if successful. */
3354 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3359 bool need_word_trtable = false;
3360 bitset_word_t elem, mask;
3361 bool dests_node_malloced = false;
3362 bool dest_states_malloced = false;
3363 Idx ndests; /* Number of the destination states from `state'. */
3364 re_dfastate_t **trtable;
3365 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3366 re_node_set follows, *dests_node;
3368 bitset_t acceptable;
3372 re_node_set dests_node[SBC_MAX];
3373 bitset_t dests_ch[SBC_MAX];
3376 /* We build DFA states which corresponds to the destination nodes
3377 from `state'. `dests_node[i]' represents the nodes which i-th
3378 destination state contains, and `dests_ch[i]' represents the
3379 characters which i-th destination state accepts. */
3380 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3381 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3384 dests_alloc = re_malloc (struct dests_alloc, 1);
3385 if (BE (dests_alloc == NULL, 0))
3387 dests_node_malloced = true;
3389 dests_node = dests_alloc->dests_node;
3390 dests_ch = dests_alloc->dests_ch;
3392 /* Initialize transiton table. */
3393 state->word_trtable = state->trtable = NULL;
3395 /* At first, group all nodes belonging to `state' into several
3397 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3398 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3400 if (dests_node_malloced)
3404 state->trtable = (re_dfastate_t **)
3405 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3406 if (BE (state->trtable == NULL, 0))
3413 err = re_node_set_alloc (&follows, ndests + 1);
3414 if (BE (err != REG_NOERROR, 0))
3417 /* Avoid arithmetic overflow in size calculation. */
3418 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3419 / (3 * sizeof (re_dfastate_t *)))
3424 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3425 + ndests * 3 * sizeof (re_dfastate_t *)))
3426 dest_states = (re_dfastate_t **)
3427 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3430 dest_states = (re_dfastate_t **)
3431 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3432 if (BE (dest_states == NULL, 0))
3435 if (dest_states_malloced)
3437 re_node_set_free (&follows);
3438 for (i = 0; i < ndests; ++i)
3439 re_node_set_free (dests_node + i);
3440 if (dests_node_malloced)
3444 dest_states_malloced = true;
3446 dest_states_word = dest_states + ndests;
3447 dest_states_nl = dest_states_word + ndests;
3448 bitset_empty (acceptable);
3450 /* Then build the states for all destinations. */
3451 for (i = 0; i < ndests; ++i)
3454 re_node_set_empty (&follows);
3455 /* Merge the follows of this destination states. */
3456 for (j = 0; j < dests_node[i].nelem; ++j)
3458 next_node = dfa->nexts[dests_node[i].elems[j]];
3459 if (next_node != REG_MISSING)
3461 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3462 if (BE (err != REG_NOERROR, 0))
3466 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3467 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3469 /* If the new state has context constraint,
3470 build appropriate states for these contexts. */
3471 if (dest_states[i]->has_constraint)
3473 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3475 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3478 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3479 need_word_trtable = true;
3481 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3483 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3488 dest_states_word[i] = dest_states[i];
3489 dest_states_nl[i] = dest_states[i];
3491 bitset_merge (acceptable, dests_ch[i]);
3494 if (!BE (need_word_trtable, 0))
3496 /* We don't care about whether the following character is a word
3497 character, or we are in a single-byte character set so we can
3498 discern by looking at the character code: allocate a
3499 256-entry transition table. */
3500 trtable = state->trtable =
3501 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3502 if (BE (trtable == NULL, 0))
3505 /* For all characters ch...: */
3506 for (i = 0; i < BITSET_WORDS; ++i)
3507 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3509 mask <<= 1, elem >>= 1, ++ch)
3510 if (BE (elem & 1, 0))
3512 /* There must be exactly one destination which accepts
3513 character ch. See group_nodes_into_DFAstates. */
3514 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3517 /* j-th destination accepts the word character ch. */
3518 if (dfa->word_char[i] & mask)
3519 trtable[ch] = dest_states_word[j];
3521 trtable[ch] = dest_states[j];
3526 /* We care about whether the following character is a word
3527 character, and we are in a multi-byte character set: discern
3528 by looking at the character code: build two 256-entry
3529 transition tables, one starting at trtable[0] and one
3530 starting at trtable[SBC_MAX]. */
3531 trtable = state->word_trtable =
3532 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3533 if (BE (trtable == NULL, 0))
3536 /* For all characters ch...: */
3537 for (i = 0; i < BITSET_WORDS; ++i)
3538 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3540 mask <<= 1, elem >>= 1, ++ch)
3541 if (BE (elem & 1, 0))
3543 /* There must be exactly one destination which accepts
3544 character ch. See group_nodes_into_DFAstates. */
3545 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3548 /* j-th destination accepts the word character ch. */
3549 trtable[ch] = dest_states[j];
3550 trtable[ch + SBC_MAX] = dest_states_word[j];
3555 if (bitset_contain (acceptable, NEWLINE_CHAR))
3557 /* The current state accepts newline character. */
3558 for (j = 0; j < ndests; ++j)
3559 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3561 /* k-th destination accepts newline character. */
3562 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3563 if (need_word_trtable)
3564 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3565 /* There must be only one destination which accepts
3566 newline. See group_nodes_into_DFAstates. */
3571 if (dest_states_malloced)
3574 re_node_set_free (&follows);
3575 for (i = 0; i < ndests; ++i)
3576 re_node_set_free (dests_node + i);
3578 if (dests_node_malloced)
3584 /* Group all nodes belonging to STATE into several destinations.
3585 Then for all destinations, set the nodes belonging to the destination
3586 to DESTS_NODE[i] and set the characters accepted by the destination
3587 to DEST_CH[i]. This function return the number of destinations. */
3591 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3592 re_node_set *dests_node, bitset_t *dests_ch)
3597 Idx ndests; /* Number of the destinations from `state'. */
3598 bitset_t accepts; /* Characters a node can accept. */
3599 const re_node_set *cur_nodes = &state->nodes;
3600 bitset_empty (accepts);
3603 /* For all the nodes belonging to `state', */
3604 for (i = 0; i < cur_nodes->nelem; ++i)
3606 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3607 re_token_type_t type = node->type;
3608 unsigned int constraint = node->constraint;
3610 /* Enumerate all single byte character this node can accept. */
3611 if (type == CHARACTER)
3612 bitset_set (accepts, node->opr.c);
3613 else if (type == SIMPLE_BRACKET)
3615 bitset_merge (accepts, node->opr.sbcset);
3617 else if (type == OP_PERIOD)
3619 #ifdef RE_ENABLE_I18N
3620 if (dfa->mb_cur_max > 1)
3621 bitset_merge (accepts, dfa->sb_char);
3624 bitset_set_all (accepts);
3625 if (!(dfa->syntax & RE_DOT_NEWLINE))
3626 bitset_clear (accepts, '\n');
3627 if (dfa->syntax & RE_DOT_NOT_NULL)
3628 bitset_clear (accepts, '\0');
3630 #ifdef RE_ENABLE_I18N
3631 else if (type == OP_UTF8_PERIOD)
3633 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3634 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3636 bitset_merge (accepts, utf8_sb_map);
3637 if (!(dfa->syntax & RE_DOT_NEWLINE))
3638 bitset_clear (accepts, '\n');
3639 if (dfa->syntax & RE_DOT_NOT_NULL)
3640 bitset_clear (accepts, '\0');
3646 /* Check the `accepts' and sift the characters which are not
3647 match it the context. */
3650 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3652 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3653 bitset_empty (accepts);
3654 if (accepts_newline)
3655 bitset_set (accepts, NEWLINE_CHAR);
3659 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3661 bitset_empty (accepts);
3665 if (constraint & NEXT_WORD_CONSTRAINT)
3667 bitset_word_t any_set = 0;
3668 if (type == CHARACTER && !node->word_char)
3670 bitset_empty (accepts);
3673 #ifdef RE_ENABLE_I18N
3674 if (dfa->mb_cur_max > 1)
3675 for (j = 0; j < BITSET_WORDS; ++j)
3676 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3679 for (j = 0; j < BITSET_WORDS; ++j)
3680 any_set |= (accepts[j] &= dfa->word_char[j]);
3684 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3686 bitset_word_t any_set = 0;
3687 if (type == CHARACTER && node->word_char)
3689 bitset_empty (accepts);
3692 #ifdef RE_ENABLE_I18N
3693 if (dfa->mb_cur_max > 1)
3694 for (j = 0; j < BITSET_WORDS; ++j)
3695 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3698 for (j = 0; j < BITSET_WORDS; ++j)
3699 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3705 /* Then divide `accepts' into DFA states, or create a new
3706 state. Above, we make sure that accepts is not empty. */
3707 for (j = 0; j < ndests; ++j)
3709 bitset_t intersec; /* Intersection sets, see below. */
3711 /* Flags, see below. */
3712 bitset_word_t has_intersec, not_subset, not_consumed;
3714 /* Optimization, skip if this state doesn't accept the character. */
3715 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3718 /* Enumerate the intersection set of this state and `accepts'. */
3720 for (k = 0; k < BITSET_WORDS; ++k)
3721 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3722 /* And skip if the intersection set is empty. */
3726 /* Then check if this state is a subset of `accepts'. */
3727 not_subset = not_consumed = 0;
3728 for (k = 0; k < BITSET_WORDS; ++k)
3730 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3731 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3734 /* If this state isn't a subset of `accepts', create a
3735 new group state, which has the `remains'. */
3738 bitset_copy (dests_ch[ndests], remains);
3739 bitset_copy (dests_ch[j], intersec);
3740 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3741 if (BE (err != REG_NOERROR, 0))
3746 /* Put the position in the current group. */
3747 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3751 /* If all characters are consumed, go to next node. */
3755 /* Some characters remain, create a new group. */
3758 bitset_copy (dests_ch[ndests], accepts);
3759 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3760 if (BE (err != REG_NOERROR, 0))
3763 bitset_empty (accepts);
3768 for (j = 0; j < ndests; ++j)
3769 re_node_set_free (dests_node + j);
3773 #ifdef RE_ENABLE_I18N
3774 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3775 Return the number of the bytes the node accepts.
3776 STR_IDX is the current index of the input string.
3778 This function handles the nodes which can accept one character, or
3779 one collating element like '.', '[a-z]', opposite to the other nodes
3780 can only accept one byte. */
3784 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3785 const re_string_t *input, Idx str_idx)
3787 const re_token_t *node = dfa->nodes + node_idx;
3788 int char_len, elem_len;
3791 if (BE (node->type == OP_UTF8_PERIOD, 0))
3793 unsigned char c = re_string_byte_at (input, str_idx), d;
3794 if (BE (c < 0xc2, 1))
3797 if (str_idx + 2 > input->len)
3800 d = re_string_byte_at (input, str_idx + 1);
3802 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3806 if (c == 0xe0 && d < 0xa0)
3812 if (c == 0xf0 && d < 0x90)
3818 if (c == 0xf8 && d < 0x88)
3824 if (c == 0xfc && d < 0x84)
3830 if (str_idx + char_len > input->len)
3833 for (i = 1; i < char_len; ++i)
3835 d = re_string_byte_at (input, str_idx + i);
3836 if (d < 0x80 || d > 0xbf)
3842 char_len = re_string_char_size_at (input, str_idx);
3843 if (node->type == OP_PERIOD)
3847 /* FIXME: I don't think this if is needed, as both '\n'
3848 and '\0' are char_len == 1. */
3849 /* '.' accepts any one character except the following two cases. */
3850 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3851 re_string_byte_at (input, str_idx) == '\n') ||
3852 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3853 re_string_byte_at (input, str_idx) == '\0'))
3858 elem_len = re_string_elem_size_at (input, str_idx);
3859 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3862 if (node->type == COMPLEX_BRACKET)
3864 const re_charset_t *cset = node->opr.mbcset;
3866 const unsigned char *pin
3867 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3872 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3873 ? re_string_wchar_at (input, str_idx) : 0);
3875 /* match with multibyte character? */
3876 for (i = 0; i < cset->nmbchars; ++i)
3877 if (wc == cset->mbchars[i])
3879 match_len = char_len;
3880 goto check_node_accept_bytes_match;
3882 /* match with character_class? */
3883 for (i = 0; i < cset->nchar_classes; ++i)
3885 wctype_t wt = cset->char_classes[i];
3886 if (__iswctype (wc, wt))
3888 match_len = char_len;
3889 goto check_node_accept_bytes_match;
3894 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3897 unsigned int in_collseq = 0;
3898 const int32_t *table, *indirect;
3899 const unsigned char *weights, *extra;
3900 const char *collseqwc;
3902 /* This #include defines a local function! */
3903 # include <locale/weight.h>
3905 /* match with collating_symbol? */
3906 if (cset->ncoll_syms)
3907 extra = (const unsigned char *)
3908 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3909 for (i = 0; i < cset->ncoll_syms; ++i)
3911 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3912 /* Compare the length of input collating element and
3913 the length of current collating element. */
3914 if (*coll_sym != elem_len)
3916 /* Compare each bytes. */
3917 for (j = 0; j < *coll_sym; j++)
3918 if (pin[j] != coll_sym[1 + j])
3922 /* Match if every bytes is equal. */
3924 goto check_node_accept_bytes_match;
3930 if (elem_len <= char_len)
3932 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3933 in_collseq = __collseq_table_lookup (collseqwc, wc);
3936 in_collseq = find_collation_sequence_value (pin, elem_len);
3938 /* match with range expression? */
3939 for (i = 0; i < cset->nranges; ++i)
3940 if (cset->range_starts[i] <= in_collseq
3941 && in_collseq <= cset->range_ends[i])
3943 match_len = elem_len;
3944 goto check_node_accept_bytes_match;
3947 /* match with equivalence_class? */
3948 if (cset->nequiv_classes)
3950 const unsigned char *cp = pin;
3951 table = (const int32_t *)
3952 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3953 weights = (const unsigned char *)
3954 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3955 extra = (const unsigned char *)
3956 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3957 indirect = (const int32_t *)
3958 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3959 int32_t idx = findidx (&cp);
3961 for (i = 0; i < cset->nequiv_classes; ++i)
3963 int32_t equiv_class_idx = cset->equiv_classes[i];
3964 size_t weight_len = weights[idx & 0xffffff];
3965 if (weight_len == weights[equiv_class_idx & 0xffffff]
3966 && (idx >> 24) == (equiv_class_idx >> 24))
3971 equiv_class_idx &= 0xffffff;
3973 while (cnt <= weight_len
3974 && (weights[equiv_class_idx + 1 + cnt]
3975 == weights[idx + 1 + cnt]))
3977 if (cnt > weight_len)
3979 match_len = elem_len;
3980 goto check_node_accept_bytes_match;
3989 /* match with range expression? */
3990 #if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && __STRICT_ANSI__)
3991 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3993 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3996 for (i = 0; i < cset->nranges; ++i)
3998 cmp_buf[0] = cset->range_starts[i];
3999 cmp_buf[4] = cset->range_ends[i];
4000 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
4001 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
4003 match_len = char_len;
4004 goto check_node_accept_bytes_match;
4008 check_node_accept_bytes_match:
4009 if (!cset->non_match)
4016 return (elem_len > char_len) ? elem_len : char_len;
4025 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4027 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4032 /* No valid character. Match it as a single byte character. */
4033 const unsigned char *collseq = (const unsigned char *)
4034 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4035 return collseq[mbs[0]];
4042 const unsigned char *extra = (const unsigned char *)
4043 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4044 int32_t extrasize = (const unsigned char *)
4045 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4047 for (idx = 0; idx < extrasize;)
4051 int32_t elem_mbs_len;
4052 /* Skip the name of collating element name. */
4053 idx = idx + extra[idx] + 1;
4054 elem_mbs_len = extra[idx++];
4055 if (mbs_len == elem_mbs_len)
4057 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4058 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4060 if (mbs_cnt == elem_mbs_len)
4061 /* Found the entry. */
4064 /* Skip the byte sequence of the collating element. */
4065 idx += elem_mbs_len;
4066 /* Adjust for the alignment. */
4067 idx = (idx + 3) & ~3;
4068 /* Skip the collation sequence value. */
4069 idx += sizeof (uint32_t);
4070 /* Skip the wide char sequence of the collating element. */
4071 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4072 /* If we found the entry, return the sequence value. */
4074 return *(uint32_t *) (extra + idx);
4075 /* Skip the collation sequence value. */
4076 idx += sizeof (uint32_t);
4082 #endif /* RE_ENABLE_I18N */
4084 /* Check whether the node accepts the byte which is IDX-th
4085 byte of the INPUT. */
4089 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4093 ch = re_string_byte_at (&mctx->input, idx);
4097 if (node->opr.c != ch)
4101 case SIMPLE_BRACKET:
4102 if (!bitset_contain (node->opr.sbcset, ch))
4106 #ifdef RE_ENABLE_I18N
4107 case OP_UTF8_PERIOD:
4108 if (ch >= ASCII_CHARS)
4113 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4114 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4122 if (node->constraint)
4124 /* The node has constraints. Check whether the current context
4125 satisfies the constraints. */
4126 unsigned int context = re_string_context_at (&mctx->input, idx,
4128 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4135 /* Extend the buffers, if the buffers have run out. */
4137 static reg_errcode_t
4138 internal_function __attribute_warn_unused_result__
4139 extend_buffers (re_match_context_t *mctx)
4142 re_string_t *pstr = &mctx->input;
4144 /* Avoid overflow. */
4145 if (BE (SIZE_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4148 /* Double the lengthes of the buffers. */
4149 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4150 if (BE (ret != REG_NOERROR, 0))
4153 if (mctx->state_log != NULL)
4155 /* And double the length of state_log. */
4156 /* XXX We have no indication of the size of this buffer. If this
4157 allocation fail we have no indication that the state_log array
4158 does not have the right size. */
4159 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4160 pstr->bufs_len + 1);
4161 if (BE (new_array == NULL, 0))
4163 mctx->state_log = new_array;
4166 /* Then reconstruct the buffers. */
4169 #ifdef RE_ENABLE_I18N
4170 if (pstr->mb_cur_max > 1)
4172 ret = build_wcs_upper_buffer (pstr);
4173 if (BE (ret != REG_NOERROR, 0))
4177 #endif /* RE_ENABLE_I18N */
4178 build_upper_buffer (pstr);
4182 #ifdef RE_ENABLE_I18N
4183 if (pstr->mb_cur_max > 1)
4184 build_wcs_buffer (pstr);
4186 #endif /* RE_ENABLE_I18N */
4188 if (pstr->trans != NULL)
4189 re_string_translate_buffer (pstr);
4196 /* Functions for matching context. */
4198 /* Initialize MCTX. */
4200 static reg_errcode_t
4201 internal_function __attribute_warn_unused_result__
4202 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4204 mctx->eflags = eflags;
4205 mctx->match_last = REG_MISSING;
4208 /* Avoid overflow. */
4209 size_t max_object_size =
4210 MAX (sizeof (struct re_backref_cache_entry),
4211 sizeof (re_sub_match_top_t *));
4212 if (BE (SIZE_MAX / max_object_size < n, 0))
4215 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4216 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4217 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4220 /* Already zero-ed by the caller.
4222 mctx->bkref_ents = NULL;
4223 mctx->nbkref_ents = 0;
4224 mctx->nsub_tops = 0; */
4225 mctx->abkref_ents = n;
4226 mctx->max_mb_elem_len = 1;
4227 mctx->asub_tops = n;
4231 /* Clean the entries which depend on the current input in MCTX.
4232 This function must be invoked when the matcher changes the start index
4233 of the input, or changes the input string. */
4237 match_ctx_clean (re_match_context_t *mctx)
4240 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4243 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4244 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4246 re_sub_match_last_t *last = top->lasts[sl_idx];
4247 re_free (last->path.array);
4250 re_free (top->lasts);
4253 re_free (top->path->array);
4254 re_free (top->path);
4259 mctx->nsub_tops = 0;
4260 mctx->nbkref_ents = 0;
4263 /* Free all the memory associated with MCTX. */
4267 match_ctx_free (re_match_context_t *mctx)
4269 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4270 match_ctx_clean (mctx);
4271 re_free (mctx->sub_tops);
4272 re_free (mctx->bkref_ents);
4275 /* Add a new backreference entry to MCTX.
4276 Note that we assume that caller never call this function with duplicate
4277 entry, and call with STR_IDX which isn't smaller than any existing entry.
4280 static reg_errcode_t
4281 internal_function __attribute_warn_unused_result__
4282 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4285 if (mctx->nbkref_ents >= mctx->abkref_ents)
4287 struct re_backref_cache_entry* new_entry;
4288 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4289 mctx->abkref_ents * 2);
4290 if (BE (new_entry == NULL, 0))
4292 re_free (mctx->bkref_ents);
4295 mctx->bkref_ents = new_entry;
4296 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4297 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4298 mctx->abkref_ents *= 2;
4300 if (mctx->nbkref_ents > 0
4301 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4302 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4304 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4305 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4306 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4307 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4309 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4310 If bit N is clear, means that this entry won't epsilon-transition to
4311 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4312 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4315 A backreference does not epsilon-transition unless it is empty, so set
4316 to all zeros if FROM != TO. */
4317 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4318 = (from == to ? -1 : 0);
4320 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4321 if (mctx->max_mb_elem_len < to - from)
4322 mctx->max_mb_elem_len = to - from;
4326 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4327 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4331 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4333 Idx left, right, mid, last;
4334 last = right = mctx->nbkref_ents;
4335 for (left = 0; left < right;)
4337 mid = (left + right) / 2;
4338 if (mctx->bkref_ents[mid].str_idx < str_idx)
4343 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4349 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4352 static reg_errcode_t
4353 internal_function __attribute_warn_unused_result__
4354 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4357 assert (mctx->sub_tops != NULL);
4358 assert (mctx->asub_tops > 0);
4360 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4362 Idx new_asub_tops = mctx->asub_tops * 2;
4363 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4364 re_sub_match_top_t *,
4366 if (BE (new_array == NULL, 0))
4368 mctx->sub_tops = new_array;
4369 mctx->asub_tops = new_asub_tops;
4371 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4372 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4374 mctx->sub_tops[mctx->nsub_tops]->node = node;
4375 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4379 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4380 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4382 static re_sub_match_last_t *
4384 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4386 re_sub_match_last_t *new_entry;
4387 if (BE (subtop->nlasts == subtop->alasts, 0))
4389 Idx new_alasts = 2 * subtop->alasts + 1;
4390 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4391 re_sub_match_last_t *,
4393 if (BE (new_array == NULL, 0))
4395 subtop->lasts = new_array;
4396 subtop->alasts = new_alasts;
4398 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4399 if (BE (new_entry != NULL, 1))
4401 subtop->lasts[subtop->nlasts] = new_entry;
4402 new_entry->node = node;
4403 new_entry->str_idx = str_idx;
4411 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4412 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4414 sctx->sifted_states = sifted_sts;
4415 sctx->limited_states = limited_sts;
4416 sctx->last_node = last_node;
4417 sctx->last_str_idx = last_str_idx;
4418 re_node_set_init_empty (&sctx->limits);