1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4 Copyright (C) 2002-2013 Free Software Foundation, Inc.
5 This file is part of the GNU C Library.
6 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8 The GNU C Library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public
10 License as published by the Free Software Foundation; either
11 version 3 of the License, or (at your option) any later version.
13 The GNU C Library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public
19 License along with the GNU C Library; if not, see
20 <http://www.gnu.org/licenses/>. */
22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
23 size_t length, reg_syntax_t syntax);
24 static void re_compile_fastmap_iter (regex_t *bufp,
25 const re_dfastate_t *init_state,
27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34 static void optimize_utf8 (re_dfa_t *dfa);
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t preorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
40 static reg_errcode_t postorder (bin_tree_t *root,
41 reg_errcode_t (fn (void *, bin_tree_t *)),
43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
51 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
52 unsigned int constraint);
53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
57 static Idx fetch_number (re_string_t *input, re_token_t *token,
59 static int peek_token (re_token_t *token, re_string_t *input,
60 reg_syntax_t syntax) internal_function;
61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
62 reg_syntax_t syntax, reg_errcode_t *err);
63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 Idx nest, reg_errcode_t *err);
66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 Idx nest, reg_errcode_t *err);
69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 Idx nest, reg_errcode_t *err);
72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 Idx nest, reg_errcode_t *err);
75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
76 re_dfa_t *dfa, re_token_t *token,
77 reg_syntax_t syntax, reg_errcode_t *err);
78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
79 re_token_t *token, reg_syntax_t syntax,
81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
83 re_token_t *token, int token_len,
87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
93 Idx *equiv_class_alloc,
94 const unsigned char *name);
95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 Idx *char_class_alloc,
99 const char *class_name,
100 reg_syntax_t syntax);
101 #else /* not RE_ENABLE_I18N */
102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
103 const unsigned char *name);
104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
106 const char *class_name,
107 reg_syntax_t syntax);
108 #endif /* not RE_ENABLE_I18N */
109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
110 RE_TRANSLATE_TYPE trans,
111 const char *class_name,
113 bool non_match, reg_errcode_t *err);
114 static bin_tree_t *create_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 re_token_type_t type);
117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
118 bin_tree_t *left, bin_tree_t *right,
119 const re_token_t *token);
120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
121 static void free_token (re_token_t *node);
122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125 /* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
130 static const char __re_error_msgid[] =
132 #define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
138 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
168 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
171 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
177 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 static const size_t __re_error_msgid_idx[] =
205 /* Entry points for GNU code. */
207 /* re_compile_pattern is the GNU regular expression compiler: it
208 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209 Returns 0 if the pattern was valid, otherwise an error string.
211 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
212 are set in BUFP on entry. */
216 re_compile_pattern (pattern, length, bufp)
219 struct re_pattern_buffer *bufp;
220 #else /* size_t might promote */
222 re_compile_pattern (const char *pattern, size_t length,
223 struct re_pattern_buffer *bufp)
228 /* And GNU code determines whether or not to get register information
229 by passing null for the REGS argument to re_match, etc., not by
230 setting no_sub, unless RE_NO_SUB is set. */
231 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
233 /* Match anchors at newline. */
234 bufp->newline_anchor = 1;
236 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
240 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
243 weak_alias (__re_compile_pattern, re_compile_pattern)
246 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
247 also be assigned to arbitrarily: each pattern buffer stores its own
248 syntax, so it can be changed between regex compilations. */
249 /* This has no initializer because initialized variables in Emacs
250 become read-only after dumping. */
251 reg_syntax_t re_syntax_options;
254 /* Specify the precise syntax of regexps for compilation. This provides
255 for compatibility for various utilities which historically have
256 different, incompatible syntaxes.
258 The argument SYNTAX is a bit mask comprised of the various bits
259 defined in regex.h. We return the old syntax. */
262 re_set_syntax (syntax)
265 reg_syntax_t ret = re_syntax_options;
267 re_syntax_options = syntax;
271 weak_alias (__re_set_syntax, re_set_syntax)
275 re_compile_fastmap (bufp)
276 struct re_pattern_buffer *bufp;
278 re_dfa_t *dfa = bufp->buffer;
279 char *fastmap = bufp->fastmap;
281 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
282 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
283 if (dfa->init_state != dfa->init_state_word)
284 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
285 if (dfa->init_state != dfa->init_state_nl)
286 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
287 if (dfa->init_state != dfa->init_state_begbuf)
288 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
289 bufp->fastmap_accurate = 1;
293 weak_alias (__re_compile_fastmap, re_compile_fastmap)
297 __attribute__ ((always_inline))
298 re_set_fastmap (char *fastmap, bool icase, int ch)
302 fastmap[tolower (ch)] = 1;
305 /* Helper function for re_compile_fastmap.
306 Compile fastmap for the initial_state INIT_STATE. */
309 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
312 re_dfa_t *dfa = bufp->buffer;
314 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
315 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
317 Idx node = init_state->nodes.elems[node_cnt];
318 re_token_type_t type = dfa->nodes[node].type;
320 if (type == CHARACTER)
322 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
323 #ifdef RE_ENABLE_I18N
324 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
326 unsigned char buf[MB_LEN_MAX];
332 *p++ = dfa->nodes[node].opr.c;
333 while (++node < dfa->nodes_len
334 && dfa->nodes[node].type == CHARACTER
335 && dfa->nodes[node].mb_partial)
336 *p++ = dfa->nodes[node].opr.c;
337 memset (&state, '\0', sizeof (state));
338 if (__mbrtowc (&wc, (const char *) buf, p - buf,
340 && (__wcrtomb ((char *) buf, towlower (wc), &state)
342 re_set_fastmap (fastmap, false, buf[0]);
346 else if (type == SIMPLE_BRACKET)
349 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
352 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
353 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
354 if (w & ((bitset_word_t) 1 << j))
355 re_set_fastmap (fastmap, icase, ch);
358 #ifdef RE_ENABLE_I18N
359 else if (type == COMPLEX_BRACKET)
361 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
365 /* See if we have to try all bytes which start multiple collation
367 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
368 collation element, and don't catch 'b' since 'b' is
369 the only collation element which starts from 'b' (and
370 it is caught by SIMPLE_BRACKET). */
371 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
372 && (cset->ncoll_syms || cset->nranges))
374 const int32_t *table = (const int32_t *)
375 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
376 for (i = 0; i < SBC_MAX; ++i)
378 re_set_fastmap (fastmap, icase, i);
382 /* See if we have to start the match at all multibyte characters,
383 i.e. where we would not find an invalid sequence. This only
384 applies to multibyte character sets; for single byte character
385 sets, the SIMPLE_BRACKET again suffices. */
386 if (dfa->mb_cur_max > 1
387 && (cset->nchar_classes || cset->non_match || cset->nranges
389 || cset->nequiv_classes
397 memset (&mbs, 0, sizeof (mbs));
398 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
399 re_set_fastmap (fastmap, false, (int) c);
406 /* ... Else catch all bytes which can start the mbchars. */
407 for (i = 0; i < cset->nmbchars; ++i)
411 memset (&state, '\0', sizeof (state));
412 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
413 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
414 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
416 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
418 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
423 #endif /* RE_ENABLE_I18N */
424 else if (type == OP_PERIOD
425 #ifdef RE_ENABLE_I18N
426 || type == OP_UTF8_PERIOD
427 #endif /* RE_ENABLE_I18N */
428 || type == END_OF_RE)
430 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
431 if (type == END_OF_RE)
432 bufp->can_be_null = 1;
438 /* Entry point for POSIX code. */
439 /* regcomp takes a regular expression as a string and compiles it.
441 PREG is a regex_t *. We do not expect any fields to be initialized,
442 since POSIX says we shouldn't. Thus, we set
444 'buffer' to the compiled pattern;
445 'used' to the length of the compiled pattern;
446 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
447 REG_EXTENDED bit in CFLAGS is set; otherwise, to
448 RE_SYNTAX_POSIX_BASIC;
449 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
450 'fastmap' to an allocated space for the fastmap;
451 'fastmap_accurate' to zero;
452 're_nsub' to the number of subexpressions in PATTERN.
454 PATTERN is the address of the pattern string.
456 CFLAGS is a series of bits which affect compilation.
458 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
459 use POSIX basic syntax.
461 If REG_NEWLINE is set, then . and [^...] don't match newline.
462 Also, regexec will try a match beginning after every newline.
464 If REG_ICASE is set, then we considers upper- and lowercase
465 versions of letters to be equivalent when matching.
467 If REG_NOSUB is set, then when PREG is passed to regexec, that
468 routine will report only success or failure, and nothing about the
471 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
472 the return codes and their meanings.) */
475 regcomp (preg, pattern, cflags)
476 regex_t *_Restrict_ preg;
477 const char *_Restrict_ pattern;
481 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
482 : RE_SYNTAX_POSIX_BASIC);
488 /* Try to allocate space for the fastmap. */
489 preg->fastmap = re_malloc (char, SBC_MAX);
490 if (BE (preg->fastmap == NULL, 0))
493 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
495 /* If REG_NEWLINE is set, newlines are treated differently. */
496 if (cflags & REG_NEWLINE)
497 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
498 syntax &= ~RE_DOT_NEWLINE;
499 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
500 /* It also changes the matching behavior. */
501 preg->newline_anchor = 1;
504 preg->newline_anchor = 0;
505 preg->no_sub = !!(cflags & REG_NOSUB);
506 preg->translate = NULL;
508 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
510 /* POSIX doesn't distinguish between an unmatched open-group and an
511 unmatched close-group: both are REG_EPAREN. */
512 if (ret == REG_ERPAREN)
515 /* We have already checked preg->fastmap != NULL. */
516 if (BE (ret == REG_NOERROR, 1))
517 /* Compute the fastmap now, since regexec cannot modify the pattern
518 buffer. This function never fails in this implementation. */
519 (void) re_compile_fastmap (preg);
522 /* Some error occurred while compiling the expression. */
523 re_free (preg->fastmap);
524 preg->fastmap = NULL;
530 weak_alias (__regcomp, regcomp)
533 /* Returns a message corresponding to an error code, ERRCODE, returned
534 from either regcomp or regexec. We don't use PREG here. */
538 regerror (errcode, preg, errbuf, errbuf_size)
540 const regex_t *_Restrict_ preg;
541 char *_Restrict_ errbuf;
543 #else /* size_t might promote */
545 regerror (int errcode, const regex_t *_Restrict_ preg,
546 char *_Restrict_ errbuf, size_t errbuf_size)
553 || errcode >= (int) (sizeof (__re_error_msgid_idx)
554 / sizeof (__re_error_msgid_idx[0])), 0))
555 /* Only error codes returned by the rest of the code should be passed
556 to this routine. If we are given anything else, or if other regex
557 code generates an invalid error code, then the program has a bug.
558 Dump core so we can fix it. */
561 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
563 msg_size = strlen (msg) + 1; /* Includes the null. */
565 if (BE (errbuf_size != 0, 1))
567 size_t cpy_size = msg_size;
568 if (BE (msg_size > errbuf_size, 0))
570 cpy_size = errbuf_size - 1;
571 errbuf[cpy_size] = '\0';
573 memcpy (errbuf, msg, cpy_size);
579 weak_alias (__regerror, regerror)
583 #ifdef RE_ENABLE_I18N
584 /* This static array is used for the map to single-byte characters when
585 UTF-8 is used. Otherwise we would allocate memory just to initialize
586 it the same all the time. UTF-8 is the preferred encoding so this is
587 a worthwhile optimization. */
588 static const bitset_t utf8_sb_map =
590 /* Set the first 128 bits. */
591 # if defined __GNUC__ && !defined __STRICT_ANSI__
592 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
594 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
595 # error "bitset_word_t is narrower than 32 bits"
596 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
598 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
599 BITSET_WORD_MAX, BITSET_WORD_MAX,
600 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
604 >> (SBC_MAX % BITSET_WORD_BITS == 0
606 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
613 free_dfa_content (re_dfa_t *dfa)
618 for (i = 0; i < dfa->nodes_len; ++i)
619 free_token (dfa->nodes + i);
620 re_free (dfa->nexts);
621 for (i = 0; i < dfa->nodes_len; ++i)
623 if (dfa->eclosures != NULL)
624 re_node_set_free (dfa->eclosures + i);
625 if (dfa->inveclosures != NULL)
626 re_node_set_free (dfa->inveclosures + i);
627 if (dfa->edests != NULL)
628 re_node_set_free (dfa->edests + i);
630 re_free (dfa->edests);
631 re_free (dfa->eclosures);
632 re_free (dfa->inveclosures);
633 re_free (dfa->nodes);
635 if (dfa->state_table)
636 for (i = 0; i <= dfa->state_hash_mask; ++i)
638 struct re_state_table_entry *entry = dfa->state_table + i;
639 for (j = 0; j < entry->num; ++j)
641 re_dfastate_t *state = entry->array[j];
644 re_free (entry->array);
646 re_free (dfa->state_table);
647 #ifdef RE_ENABLE_I18N
648 if (dfa->sb_char != utf8_sb_map)
649 re_free (dfa->sb_char);
651 re_free (dfa->subexp_map);
653 re_free (dfa->re_str);
660 /* Free dynamically allocated space used by PREG. */
666 re_dfa_t *dfa = preg->buffer;
667 if (BE (dfa != NULL, 1))
669 lock_fini (dfa->lock);
670 free_dfa_content (dfa);
675 re_free (preg->fastmap);
676 preg->fastmap = NULL;
678 re_free (preg->translate);
679 preg->translate = NULL;
682 weak_alias (__regfree, regfree)
685 /* Entry points compatible with 4.2 BSD regex library. We don't define
686 them unless specifically requested. */
688 #if defined _REGEX_RE_COMP || defined _LIBC
690 /* BSD has one and only one pattern buffer. */
691 static struct re_pattern_buffer re_comp_buf;
695 /* Make these definitions weak in libc, so POSIX programs can redefine
696 these names if they don't use our functions, and still use
697 regcomp/regexec above without link errors. */
708 if (!re_comp_buf.buffer)
709 return gettext ("No previous regular expression");
713 if (re_comp_buf.buffer)
715 fastmap = re_comp_buf.fastmap;
716 re_comp_buf.fastmap = NULL;
717 __regfree (&re_comp_buf);
718 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
719 re_comp_buf.fastmap = fastmap;
722 if (re_comp_buf.fastmap == NULL)
724 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
725 if (re_comp_buf.fastmap == NULL)
726 return (char *) gettext (__re_error_msgid
727 + __re_error_msgid_idx[(int) REG_ESPACE]);
730 /* Since 're_exec' always passes NULL for the 'regs' argument, we
731 don't need to initialize the pattern buffer fields which affect it. */
733 /* Match anchors at newlines. */
734 re_comp_buf.newline_anchor = 1;
736 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
741 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
742 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
746 libc_freeres_fn (free_mem)
748 __regfree (&re_comp_buf);
752 #endif /* _REGEX_RE_COMP */
754 /* Internal entry point.
755 Compile the regular expression PATTERN, whose length is LENGTH.
756 SYNTAX indicate regular expression's syntax. */
759 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
762 reg_errcode_t err = REG_NOERROR;
766 /* Initialize the pattern buffer. */
767 preg->fastmap_accurate = 0;
768 preg->syntax = syntax;
769 preg->not_bol = preg->not_eol = 0;
772 preg->can_be_null = 0;
773 preg->regs_allocated = REGS_UNALLOCATED;
775 /* Initialize the dfa. */
777 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
779 /* If zero allocated, but buffer is non-null, try to realloc
780 enough space. This loses if buffer's address is bogus, but
781 that is the user's responsibility. If ->buffer is NULL this
782 is a simple allocation. */
783 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
786 preg->allocated = sizeof (re_dfa_t);
789 preg->used = sizeof (re_dfa_t);
791 err = init_dfa (dfa, length);
792 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
794 if (BE (err != REG_NOERROR, 0))
796 free_dfa_content (dfa);
802 /* Note: length+1 will not overflow since it is checked in init_dfa. */
803 dfa->re_str = re_malloc (char, length + 1);
804 strncpy (dfa->re_str, pattern, length + 1);
807 err = re_string_construct (®exp, pattern, length, preg->translate,
808 (syntax & RE_ICASE) != 0, dfa);
809 if (BE (err != REG_NOERROR, 0))
811 re_compile_internal_free_return:
812 free_workarea_compile (preg);
813 re_string_destruct (®exp);
814 lock_fini (dfa->lock);
815 free_dfa_content (dfa);
821 /* Parse the regular expression, and build a structure tree. */
823 dfa->str_tree = parse (®exp, preg, syntax, &err);
824 if (BE (dfa->str_tree == NULL, 0))
825 goto re_compile_internal_free_return;
827 /* Analyze the tree and create the nfa. */
828 err = analyze (preg);
829 if (BE (err != REG_NOERROR, 0))
830 goto re_compile_internal_free_return;
832 #ifdef RE_ENABLE_I18N
833 /* If possible, do searching in single byte encoding to speed things up. */
834 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
838 /* Then create the initial state of the dfa. */
839 err = create_initial_state (dfa);
841 /* Release work areas. */
842 free_workarea_compile (preg);
843 re_string_destruct (®exp);
845 if (BE (err != REG_NOERROR, 0))
847 lock_fini (dfa->lock);
848 free_dfa_content (dfa);
856 /* Initialize DFA. We use the length of the regular expression PAT_LEN
857 as the initial length of some arrays. */
860 init_dfa (re_dfa_t *dfa, size_t pat_len)
862 __re_size_t table_size;
864 const char *codeset_name;
866 #ifdef RE_ENABLE_I18N
867 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
869 size_t max_i18n_object_size = 0;
871 size_t max_object_size =
872 MAX (sizeof (struct re_state_table_entry),
873 MAX (sizeof (re_token_t),
874 MAX (sizeof (re_node_set),
875 MAX (sizeof (regmatch_t),
876 max_i18n_object_size))));
878 memset (dfa, '\0', sizeof (re_dfa_t));
880 /* Force allocation of str_tree_storage the first time. */
881 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
883 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
884 calculation below, and for similar doubling calculations
885 elsewhere. And it's <= rather than <, because some of the
886 doubling calculations add 1 afterwards. */
887 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
890 dfa->nodes_alloc = pat_len + 1;
891 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
893 /* table_size = 2 ^ ceil(log pat_len) */
894 for (table_size = 1; ; table_size <<= 1)
895 if (table_size > pat_len)
898 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
899 dfa->state_hash_mask = table_size - 1;
901 dfa->mb_cur_max = MB_CUR_MAX;
903 if (dfa->mb_cur_max == 6
904 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
906 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
909 codeset_name = nl_langinfo (CODESET);
910 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
911 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
912 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
913 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
916 /* We check exhaustively in the loop below if this charset is a
917 superset of ASCII. */
918 dfa->map_notascii = 0;
921 #ifdef RE_ENABLE_I18N
922 if (dfa->mb_cur_max > 1)
925 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
930 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
931 if (BE (dfa->sb_char == NULL, 0))
934 /* Set the bits corresponding to single byte chars. */
935 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
936 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
938 wint_t wch = __btowc (ch);
940 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
942 if (isascii (ch) && wch != ch)
943 dfa->map_notascii = 1;
950 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
955 /* Initialize WORD_CHAR table, which indicate which character is
956 "word". In this case "word" means that it is the word construction
957 character used by some operators like "\<", "\>", etc. */
961 init_word_char (re_dfa_t *dfa)
966 dfa->word_ops_used = 1;
967 if (BE (dfa->map_notascii == 0, 1))
969 bitset_word_t bits0 = 0x00000000;
970 bitset_word_t bits1 = 0x03ff0000;
971 bitset_word_t bits2 = 0x87fffffe;
972 bitset_word_t bits3 = 0x07fffffe;
973 if (BITSET_WORD_BITS == 64)
975 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
976 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
979 else if (BITSET_WORD_BITS == 32)
981 dfa->word_char[0] = bits0;
982 dfa->word_char[1] = bits1;
983 dfa->word_char[2] = bits2;
984 dfa->word_char[3] = bits3;
991 if (BE (dfa->is_utf8, 1))
993 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
999 for (; i < BITSET_WORDS; ++i)
1000 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
1001 if (isalnum (ch) || ch == '_')
1002 dfa->word_char[i] |= (bitset_word_t) 1 << j;
1005 /* Free the work area which are only used while compiling. */
1008 free_workarea_compile (regex_t *preg)
1010 re_dfa_t *dfa = preg->buffer;
1011 bin_tree_storage_t *storage, *next;
1012 for (storage = dfa->str_tree_storage; storage; storage = next)
1014 next = storage->next;
1017 dfa->str_tree_storage = NULL;
1018 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1019 dfa->str_tree = NULL;
1020 re_free (dfa->org_indices);
1021 dfa->org_indices = NULL;
1024 /* Create initial states for all contexts. */
1026 static reg_errcode_t
1027 create_initial_state (re_dfa_t *dfa)
1031 re_node_set init_nodes;
1033 /* Initial states have the epsilon closure of the node which is
1034 the first node of the regular expression. */
1035 first = dfa->str_tree->first->node_idx;
1036 dfa->init_node = first;
1037 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1038 if (BE (err != REG_NOERROR, 0))
1041 /* The back-references which are in initial states can epsilon transit,
1042 since in this case all of the subexpressions can be null.
1043 Then we add epsilon closures of the nodes which are the next nodes of
1044 the back-references. */
1045 if (dfa->nbackref > 0)
1046 for (i = 0; i < init_nodes.nelem; ++i)
1048 Idx node_idx = init_nodes.elems[i];
1049 re_token_type_t type = dfa->nodes[node_idx].type;
1052 if (type != OP_BACK_REF)
1054 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1056 re_token_t *clexp_node;
1057 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1058 if (clexp_node->type == OP_CLOSE_SUBEXP
1059 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1062 if (clexp_idx == init_nodes.nelem)
1065 if (type == OP_BACK_REF)
1067 Idx dest_idx = dfa->edests[node_idx].elems[0];
1068 if (!re_node_set_contains (&init_nodes, dest_idx))
1070 reg_errcode_t merge_err
1071 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1072 if (merge_err != REG_NOERROR)
1079 /* It must be the first time to invoke acquire_state. */
1080 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1081 /* We don't check ERR here, since the initial state must not be NULL. */
1082 if (BE (dfa->init_state == NULL, 0))
1084 if (dfa->init_state->has_constraint)
1086 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1088 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1090 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1094 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1095 || dfa->init_state_begbuf == NULL, 0))
1099 dfa->init_state_word = dfa->init_state_nl
1100 = dfa->init_state_begbuf = dfa->init_state;
1102 re_node_set_free (&init_nodes);
1106 #ifdef RE_ENABLE_I18N
1107 /* If it is possible to do searching in single byte encoding instead of UTF-8
1108 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1109 DFA nodes where needed. */
1112 optimize_utf8 (re_dfa_t *dfa)
1116 bool mb_chars = false;
1117 bool has_period = false;
1119 for (node = 0; node < dfa->nodes_len; ++node)
1120 switch (dfa->nodes[node].type)
1123 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1127 switch (dfa->nodes[node].opr.ctx_type)
1135 /* Word anchors etc. cannot be handled. It's okay to test
1136 opr.ctx_type since constraints (for all DFA nodes) are
1137 created by ORing one or more opr.ctx_type values. */
1147 case OP_DUP_ASTERISK:
1148 case OP_OPEN_SUBEXP:
1149 case OP_CLOSE_SUBEXP:
1151 case COMPLEX_BRACKET:
1153 case SIMPLE_BRACKET:
1154 /* Just double check. */
1156 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1158 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1159 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1161 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1171 if (mb_chars || has_period)
1172 for (node = 0; node < dfa->nodes_len; ++node)
1174 if (dfa->nodes[node].type == CHARACTER
1175 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1176 dfa->nodes[node].mb_partial = 0;
1177 else if (dfa->nodes[node].type == OP_PERIOD)
1178 dfa->nodes[node].type = OP_UTF8_PERIOD;
1181 /* The search can be in single byte locale. */
1182 dfa->mb_cur_max = 1;
1184 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1188 /* Analyze the structure tree, and calculate "first", "next", "edest",
1189 "eclosure", and "inveclosure". */
1191 static reg_errcode_t
1192 analyze (regex_t *preg)
1194 re_dfa_t *dfa = preg->buffer;
1197 /* Allocate arrays. */
1198 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1199 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1200 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1201 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1202 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1203 || dfa->eclosures == NULL, 0))
1206 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1207 if (dfa->subexp_map != NULL)
1210 for (i = 0; i < preg->re_nsub; i++)
1211 dfa->subexp_map[i] = i;
1212 preorder (dfa->str_tree, optimize_subexps, dfa);
1213 for (i = 0; i < preg->re_nsub; i++)
1214 if (dfa->subexp_map[i] != i)
1216 if (i == preg->re_nsub)
1218 free (dfa->subexp_map);
1219 dfa->subexp_map = NULL;
1223 ret = postorder (dfa->str_tree, lower_subexps, preg);
1224 if (BE (ret != REG_NOERROR, 0))
1226 ret = postorder (dfa->str_tree, calc_first, dfa);
1227 if (BE (ret != REG_NOERROR, 0))
1229 preorder (dfa->str_tree, calc_next, dfa);
1230 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1231 if (BE (ret != REG_NOERROR, 0))
1233 ret = calc_eclosure (dfa);
1234 if (BE (ret != REG_NOERROR, 0))
1237 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1238 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1239 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1242 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1243 if (BE (dfa->inveclosures == NULL, 0))
1245 ret = calc_inveclosure (dfa);
1251 /* Our parse trees are very unbalanced, so we cannot use a stack to
1252 implement parse tree visits. Instead, we use parent pointers and
1253 some hairy code in these two functions. */
1254 static reg_errcode_t
1255 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1258 bin_tree_t *node, *prev;
1260 for (node = root; ; )
1262 /* Descend down the tree, preferably to the left (or to the right
1263 if that's the only child). */
1264 while (node->left || node->right)
1272 reg_errcode_t err = fn (extra, node);
1273 if (BE (err != REG_NOERROR, 0))
1275 if (node->parent == NULL)
1278 node = node->parent;
1280 /* Go up while we have a node that is reached from the right. */
1281 while (node->right == prev || node->right == NULL);
1286 static reg_errcode_t
1287 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1292 for (node = root; ; )
1294 reg_errcode_t err = fn (extra, node);
1295 if (BE (err != REG_NOERROR, 0))
1298 /* Go to the left node, or up and to the right. */
1303 bin_tree_t *prev = NULL;
1304 while (node->right == prev || node->right == NULL)
1307 node = node->parent;
1316 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1317 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1318 backreferences as well. Requires a preorder visit. */
1319 static reg_errcode_t
1320 optimize_subexps (void *extra, bin_tree_t *node)
1322 re_dfa_t *dfa = (re_dfa_t *) extra;
1324 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1326 int idx = node->token.opr.idx;
1327 node->token.opr.idx = dfa->subexp_map[idx];
1328 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1331 else if (node->token.type == SUBEXP
1332 && node->left && node->left->token.type == SUBEXP)
1334 Idx other_idx = node->left->token.opr.idx;
1336 node->left = node->left->left;
1338 node->left->parent = node;
1340 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1341 if (other_idx < BITSET_WORD_BITS)
1342 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1348 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1349 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1350 static reg_errcode_t
1351 lower_subexps (void *extra, bin_tree_t *node)
1353 regex_t *preg = (regex_t *) extra;
1354 reg_errcode_t err = REG_NOERROR;
1356 if (node->left && node->left->token.type == SUBEXP)
1358 node->left = lower_subexp (&err, preg, node->left);
1360 node->left->parent = node;
1362 if (node->right && node->right->token.type == SUBEXP)
1364 node->right = lower_subexp (&err, preg, node->right);
1366 node->right->parent = node;
1373 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1375 re_dfa_t *dfa = preg->buffer;
1376 bin_tree_t *body = node->left;
1377 bin_tree_t *op, *cls, *tree1, *tree;
1380 /* We do not optimize empty subexpressions, because otherwise we may
1381 have bad CONCAT nodes with NULL children. This is obviously not
1382 very common, so we do not lose much. An example that triggers
1383 this case is the sed "script" /\(\)/x. */
1384 && node->left != NULL
1385 && (node->token.opr.idx >= BITSET_WORD_BITS
1386 || !(dfa->used_bkref_map
1387 & ((bitset_word_t) 1 << node->token.opr.idx))))
1390 /* Convert the SUBEXP node to the concatenation of an
1391 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1392 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1393 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1394 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1395 tree = create_tree (dfa, op, tree1, CONCAT);
1396 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1402 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1403 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1407 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1408 nodes. Requires a postorder visit. */
1409 static reg_errcode_t
1410 calc_first (void *extra, bin_tree_t *node)
1412 re_dfa_t *dfa = (re_dfa_t *) extra;
1413 if (node->token.type == CONCAT)
1415 node->first = node->left->first;
1416 node->node_idx = node->left->node_idx;
1421 node->node_idx = re_dfa_add_node (dfa, node->token);
1422 if (BE (node->node_idx == REG_MISSING, 0))
1424 if (node->token.type == ANCHOR)
1425 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1430 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1431 static reg_errcode_t
1432 calc_next (void *extra, bin_tree_t *node)
1434 switch (node->token.type)
1436 case OP_DUP_ASTERISK:
1437 node->left->next = node;
1440 node->left->next = node->right->first;
1441 node->right->next = node->next;
1445 node->left->next = node->next;
1447 node->right->next = node->next;
1453 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1454 static reg_errcode_t
1455 link_nfa_nodes (void *extra, bin_tree_t *node)
1457 re_dfa_t *dfa = (re_dfa_t *) extra;
1458 Idx idx = node->node_idx;
1459 reg_errcode_t err = REG_NOERROR;
1461 switch (node->token.type)
1467 assert (node->next == NULL);
1470 case OP_DUP_ASTERISK:
1474 dfa->has_plural_match = 1;
1475 if (node->left != NULL)
1476 left = node->left->first->node_idx;
1478 left = node->next->node_idx;
1479 if (node->right != NULL)
1480 right = node->right->first->node_idx;
1482 right = node->next->node_idx;
1483 assert (REG_VALID_INDEX (left));
1484 assert (REG_VALID_INDEX (right));
1485 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1490 case OP_OPEN_SUBEXP:
1491 case OP_CLOSE_SUBEXP:
1492 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1496 dfa->nexts[idx] = node->next->node_idx;
1497 if (node->token.type == OP_BACK_REF)
1498 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1502 assert (!IS_EPSILON_NODE (node->token.type));
1503 dfa->nexts[idx] = node->next->node_idx;
1510 /* Duplicate the epsilon closure of the node ROOT_NODE.
1511 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1512 to their own constraint. */
1514 static reg_errcode_t
1516 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1517 Idx root_node, unsigned int init_constraint)
1519 Idx org_node, clone_node;
1521 unsigned int constraint = init_constraint;
1522 for (org_node = top_org_node, clone_node = top_clone_node;;)
1524 Idx org_dest, clone_dest;
1525 if (dfa->nodes[org_node].type == OP_BACK_REF)
1527 /* If the back reference epsilon-transit, its destination must
1528 also have the constraint. Then duplicate the epsilon closure
1529 of the destination of the back reference, and store it in
1530 edests of the back reference. */
1531 org_dest = dfa->nexts[org_node];
1532 re_node_set_empty (dfa->edests + clone_node);
1533 clone_dest = duplicate_node (dfa, org_dest, constraint);
1534 if (BE (clone_dest == REG_MISSING, 0))
1536 dfa->nexts[clone_node] = dfa->nexts[org_node];
1537 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541 else if (dfa->edests[org_node].nelem == 0)
1543 /* In case of the node can't epsilon-transit, don't duplicate the
1544 destination and store the original destination as the
1545 destination of the node. */
1546 dfa->nexts[clone_node] = dfa->nexts[org_node];
1549 else if (dfa->edests[org_node].nelem == 1)
1551 /* In case of the node can epsilon-transit, and it has only one
1553 org_dest = dfa->edests[org_node].elems[0];
1554 re_node_set_empty (dfa->edests + clone_node);
1555 /* If the node is root_node itself, it means the epsilon closure
1556 has a loop. Then tie it to the destination of the root_node. */
1557 if (org_node == root_node && clone_node != org_node)
1559 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1564 /* In case the node has another constraint, append it. */
1565 constraint |= dfa->nodes[org_node].constraint;
1566 clone_dest = duplicate_node (dfa, org_dest, constraint);
1567 if (BE (clone_dest == REG_MISSING, 0))
1569 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1573 else /* dfa->edests[org_node].nelem == 2 */
1575 /* In case of the node can epsilon-transit, and it has two
1576 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1577 org_dest = dfa->edests[org_node].elems[0];
1578 re_node_set_empty (dfa->edests + clone_node);
1579 /* Search for a duplicated node which satisfies the constraint. */
1580 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1581 if (clone_dest == REG_MISSING)
1583 /* There is no such duplicated node, create a new one. */
1585 clone_dest = duplicate_node (dfa, org_dest, constraint);
1586 if (BE (clone_dest == REG_MISSING, 0))
1588 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1592 root_node, constraint);
1593 if (BE (err != REG_NOERROR, 0))
1598 /* There is a duplicated node which satisfies the constraint,
1599 use it to avoid infinite loop. */
1600 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1605 org_dest = dfa->edests[org_node].elems[1];
1606 clone_dest = duplicate_node (dfa, org_dest, constraint);
1607 if (BE (clone_dest == REG_MISSING, 0))
1609 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1613 org_node = org_dest;
1614 clone_node = clone_dest;
1619 /* Search for a node which is duplicated from the node ORG_NODE, and
1620 satisfies the constraint CONSTRAINT. */
1623 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1624 unsigned int constraint)
1627 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1629 if (org_node == dfa->org_indices[idx]
1630 && constraint == dfa->nodes[idx].constraint)
1631 return idx; /* Found. */
1633 return REG_MISSING; /* Not found. */
1636 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1637 Return the index of the new node, or REG_MISSING if insufficient storage is
1641 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1643 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1644 if (BE (dup_idx != REG_MISSING, 1))
1646 dfa->nodes[dup_idx].constraint = constraint;
1647 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1648 dfa->nodes[dup_idx].duplicated = 1;
1650 /* Store the index of the original node. */
1651 dfa->org_indices[dup_idx] = org_idx;
1656 static reg_errcode_t
1657 calc_inveclosure (re_dfa_t *dfa)
1661 for (idx = 0; idx < dfa->nodes_len; ++idx)
1662 re_node_set_init_empty (dfa->inveclosures + idx);
1664 for (src = 0; src < dfa->nodes_len; ++src)
1666 Idx *elems = dfa->eclosures[src].elems;
1667 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1669 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1678 /* Calculate "eclosure" for all the node in DFA. */
1680 static reg_errcode_t
1681 calc_eclosure (re_dfa_t *dfa)
1686 assert (dfa->nodes_len > 0);
1689 /* For each nodes, calculate epsilon closure. */
1690 for (node_idx = 0; ; ++node_idx)
1693 re_node_set eclosure_elem;
1694 if (node_idx == dfa->nodes_len)
1703 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1706 /* If we have already calculated, skip it. */
1707 if (dfa->eclosures[node_idx].nelem != 0)
1709 /* Calculate epsilon closure of 'node_idx'. */
1710 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1711 if (BE (err != REG_NOERROR, 0))
1714 if (dfa->eclosures[node_idx].nelem == 0)
1717 re_node_set_free (&eclosure_elem);
1723 /* Calculate epsilon closure of NODE. */
1725 static reg_errcode_t
1726 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1730 re_node_set eclosure;
1732 bool incomplete = false;
1733 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1734 if (BE (err != REG_NOERROR, 0))
1737 /* This indicates that we are calculating this node now.
1738 We reference this value to avoid infinite loop. */
1739 dfa->eclosures[node].nelem = REG_MISSING;
1741 /* If the current node has constraints, duplicate all nodes
1742 since they must inherit the constraints. */
1743 if (dfa->nodes[node].constraint
1744 && dfa->edests[node].nelem
1745 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1747 err = duplicate_node_closure (dfa, node, node, node,
1748 dfa->nodes[node].constraint);
1749 if (BE (err != REG_NOERROR, 0))
1753 /* Expand each epsilon destination nodes. */
1754 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1755 for (i = 0; i < dfa->edests[node].nelem; ++i)
1757 re_node_set eclosure_elem;
1758 Idx edest = dfa->edests[node].elems[i];
1759 /* If calculating the epsilon closure of 'edest' is in progress,
1760 return intermediate result. */
1761 if (dfa->eclosures[edest].nelem == REG_MISSING)
1766 /* If we haven't calculated the epsilon closure of 'edest' yet,
1767 calculate now. Otherwise use calculated epsilon closure. */
1768 if (dfa->eclosures[edest].nelem == 0)
1770 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1771 if (BE (err != REG_NOERROR, 0))
1775 eclosure_elem = dfa->eclosures[edest];
1776 /* Merge the epsilon closure of 'edest'. */
1777 err = re_node_set_merge (&eclosure, &eclosure_elem);
1778 if (BE (err != REG_NOERROR, 0))
1780 /* If the epsilon closure of 'edest' is incomplete,
1781 the epsilon closure of this node is also incomplete. */
1782 if (dfa->eclosures[edest].nelem == 0)
1785 re_node_set_free (&eclosure_elem);
1789 /* An epsilon closure includes itself. */
1790 ok = re_node_set_insert (&eclosure, node);
1793 if (incomplete && !root)
1794 dfa->eclosures[node].nelem = 0;
1796 dfa->eclosures[node] = eclosure;
1797 *new_set = eclosure;
1801 /* Functions for token which are used in the parser. */
1803 /* Fetch a token from INPUT.
1804 We must not use this function inside bracket expressions. */
1808 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1810 re_string_skip_bytes (input, peek_token (result, input, syntax));
1813 /* Peek a token from INPUT, and return the length of the token.
1814 We must not use this function inside bracket expressions. */
1818 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1822 if (re_string_eoi (input))
1824 token->type = END_OF_RE;
1828 c = re_string_peek_byte (input, 0);
1831 token->word_char = 0;
1832 #ifdef RE_ENABLE_I18N
1833 token->mb_partial = 0;
1834 if (input->mb_cur_max > 1 &&
1835 !re_string_first_byte (input, re_string_cur_idx (input)))
1837 token->type = CHARACTER;
1838 token->mb_partial = 1;
1845 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1847 token->type = BACK_SLASH;
1851 c2 = re_string_peek_byte_case (input, 1);
1853 token->type = CHARACTER;
1854 #ifdef RE_ENABLE_I18N
1855 if (input->mb_cur_max > 1)
1857 wint_t wc = re_string_wchar_at (input,
1858 re_string_cur_idx (input) + 1);
1859 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1863 token->word_char = IS_WORD_CHAR (c2) != 0;
1868 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1869 token->type = OP_ALT;
1871 case '1': case '2': case '3': case '4': case '5':
1872 case '6': case '7': case '8': case '9':
1873 if (!(syntax & RE_NO_BK_REFS))
1875 token->type = OP_BACK_REF;
1876 token->opr.idx = c2 - '1';
1880 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = WORD_FIRST;
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = WORD_LAST;
1894 if (!(syntax & RE_NO_GNU_OPS))
1896 token->type = ANCHOR;
1897 token->opr.ctx_type = WORD_DELIM;
1901 if (!(syntax & RE_NO_GNU_OPS))
1903 token->type = ANCHOR;
1904 token->opr.ctx_type = NOT_WORD_DELIM;
1908 if (!(syntax & RE_NO_GNU_OPS))
1909 token->type = OP_WORD;
1912 if (!(syntax & RE_NO_GNU_OPS))
1913 token->type = OP_NOTWORD;
1916 if (!(syntax & RE_NO_GNU_OPS))
1917 token->type = OP_SPACE;
1920 if (!(syntax & RE_NO_GNU_OPS))
1921 token->type = OP_NOTSPACE;
1924 if (!(syntax & RE_NO_GNU_OPS))
1926 token->type = ANCHOR;
1927 token->opr.ctx_type = BUF_FIRST;
1931 if (!(syntax & RE_NO_GNU_OPS))
1933 token->type = ANCHOR;
1934 token->opr.ctx_type = BUF_LAST;
1938 if (!(syntax & RE_NO_BK_PARENS))
1939 token->type = OP_OPEN_SUBEXP;
1942 if (!(syntax & RE_NO_BK_PARENS))
1943 token->type = OP_CLOSE_SUBEXP;
1946 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1947 token->type = OP_DUP_PLUS;
1950 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1951 token->type = OP_DUP_QUESTION;
1954 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1955 token->type = OP_OPEN_DUP_NUM;
1958 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1959 token->type = OP_CLOSE_DUP_NUM;
1967 token->type = CHARACTER;
1968 #ifdef RE_ENABLE_I18N
1969 if (input->mb_cur_max > 1)
1971 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1972 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1976 token->word_char = IS_WORD_CHAR (token->opr.c);
1981 if (syntax & RE_NEWLINE_ALT)
1982 token->type = OP_ALT;
1985 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1986 token->type = OP_ALT;
1989 token->type = OP_DUP_ASTERISK;
1992 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1993 token->type = OP_DUP_PLUS;
1996 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1997 token->type = OP_DUP_QUESTION;
2000 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2001 token->type = OP_OPEN_DUP_NUM;
2004 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2005 token->type = OP_CLOSE_DUP_NUM;
2008 if (syntax & RE_NO_BK_PARENS)
2009 token->type = OP_OPEN_SUBEXP;
2012 if (syntax & RE_NO_BK_PARENS)
2013 token->type = OP_CLOSE_SUBEXP;
2016 token->type = OP_OPEN_BRACKET;
2019 token->type = OP_PERIOD;
2022 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2023 re_string_cur_idx (input) != 0)
2025 char prev = re_string_peek_byte (input, -1);
2026 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2029 token->type = ANCHOR;
2030 token->opr.ctx_type = LINE_FIRST;
2033 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2034 re_string_cur_idx (input) + 1 != re_string_length (input))
2037 re_string_skip_bytes (input, 1);
2038 peek_token (&next, input, syntax);
2039 re_string_skip_bytes (input, -1);
2040 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2043 token->type = ANCHOR;
2044 token->opr.ctx_type = LINE_LAST;
2052 /* Peek a token from INPUT, and return the length of the token.
2053 We must not use this function out of bracket expressions. */
2057 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2060 if (re_string_eoi (input))
2062 token->type = END_OF_RE;
2065 c = re_string_peek_byte (input, 0);
2068 #ifdef RE_ENABLE_I18N
2069 if (input->mb_cur_max > 1 &&
2070 !re_string_first_byte (input, re_string_cur_idx (input)))
2072 token->type = CHARACTER;
2075 #endif /* RE_ENABLE_I18N */
2077 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2078 && re_string_cur_idx (input) + 1 < re_string_length (input))
2080 /* In this case, '\' escape a character. */
2082 re_string_skip_bytes (input, 1);
2083 c2 = re_string_peek_byte (input, 0);
2085 token->type = CHARACTER;
2088 if (c == '[') /* '[' is a special char in a bracket exps. */
2092 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2093 c2 = re_string_peek_byte (input, 1);
2101 token->type = OP_OPEN_COLL_ELEM;
2104 token->type = OP_OPEN_EQUIV_CLASS;
2107 if (syntax & RE_CHAR_CLASSES)
2109 token->type = OP_OPEN_CHAR_CLASS;
2112 /* else fall through. */
2114 token->type = CHARACTER;
2124 token->type = OP_CHARSET_RANGE;
2127 token->type = OP_CLOSE_BRACKET;
2130 token->type = OP_NON_MATCH_LIST;
2133 token->type = CHARACTER;
2138 /* Functions for parser. */
2140 /* Entry point of the parser.
2141 Parse the regular expression REGEXP and return the structure tree.
2142 If an error occurs, ERR is set by error code, and return NULL.
2143 This function build the following tree, from regular expression <reg_exp>:
2149 CAT means concatenation.
2150 EOR means end of regular expression. */
2153 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2156 re_dfa_t *dfa = preg->buffer;
2157 bin_tree_t *tree, *eor, *root;
2158 re_token_t current_token;
2159 dfa->syntax = syntax;
2160 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2161 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2162 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2164 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2166 root = create_tree (dfa, tree, eor, CONCAT);
2169 if (BE (eor == NULL || root == NULL, 0))
2177 /* This function build the following tree, from regular expression
2178 <branch1>|<branch2>:
2184 ALT means alternative, which represents the operator '|'. */
2187 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2188 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2190 re_dfa_t *dfa = preg->buffer;
2191 bin_tree_t *tree, *branch = NULL;
2192 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2193 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2196 while (token->type == OP_ALT)
2198 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2199 if (token->type != OP_ALT && token->type != END_OF_RE
2200 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2202 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2203 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2208 tree = create_tree (dfa, tree, branch, OP_ALT);
2209 if (BE (tree == NULL, 0))
2218 /* This function build the following tree, from regular expression
2225 CAT means concatenation. */
2228 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2229 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2231 bin_tree_t *tree, *expr;
2232 re_dfa_t *dfa = preg->buffer;
2233 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2234 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2237 while (token->type != OP_ALT && token->type != END_OF_RE
2238 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2240 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2241 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2244 postorder (tree, free_tree, NULL);
2247 if (tree != NULL && expr != NULL)
2249 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2250 if (newtree == NULL)
2252 postorder (expr, free_tree, NULL);
2253 postorder (tree, free_tree, NULL);
2259 else if (tree == NULL)
2261 /* Otherwise expr == NULL, we don't need to create new tree. */
2266 /* This function build the following tree, from regular expression a*:
2273 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2274 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2276 re_dfa_t *dfa = preg->buffer;
2278 switch (token->type)
2281 tree = create_token_tree (dfa, NULL, NULL, token);
2282 if (BE (tree == NULL, 0))
2287 #ifdef RE_ENABLE_I18N
2288 if (dfa->mb_cur_max > 1)
2290 while (!re_string_eoi (regexp)
2291 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2293 bin_tree_t *mbc_remain;
2294 fetch_token (token, regexp, syntax);
2295 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2296 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2297 if (BE (mbc_remain == NULL || tree == NULL, 0))
2306 case OP_OPEN_SUBEXP:
2307 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2308 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2311 case OP_OPEN_BRACKET:
2312 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2313 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2317 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2322 dfa->used_bkref_map |= 1 << token->opr.idx;
2323 tree = create_token_tree (dfa, NULL, NULL, token);
2324 if (BE (tree == NULL, 0))
2330 dfa->has_mb_node = 1;
2332 case OP_OPEN_DUP_NUM:
2333 if (syntax & RE_CONTEXT_INVALID_DUP)
2339 case OP_DUP_ASTERISK:
2341 case OP_DUP_QUESTION:
2342 if (syntax & RE_CONTEXT_INVALID_OPS)
2347 else if (syntax & RE_CONTEXT_INDEP_OPS)
2349 fetch_token (token, regexp, syntax);
2350 return parse_expression (regexp, preg, token, syntax, nest, err);
2352 /* else fall through */
2353 case OP_CLOSE_SUBEXP:
2354 if ((token->type == OP_CLOSE_SUBEXP) &&
2355 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2360 /* else fall through */
2361 case OP_CLOSE_DUP_NUM:
2362 /* We treat it as a normal character. */
2364 /* Then we can these characters as normal characters. */
2365 token->type = CHARACTER;
2366 /* mb_partial and word_char bits should be initialized already
2368 tree = create_token_tree (dfa, NULL, NULL, token);
2369 if (BE (tree == NULL, 0))
2376 if ((token->opr.ctx_type
2377 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2378 && dfa->word_ops_used == 0)
2379 init_word_char (dfa);
2380 if (token->opr.ctx_type == WORD_DELIM
2381 || token->opr.ctx_type == NOT_WORD_DELIM)
2383 bin_tree_t *tree_first, *tree_last;
2384 if (token->opr.ctx_type == WORD_DELIM)
2386 token->opr.ctx_type = WORD_FIRST;
2387 tree_first = create_token_tree (dfa, NULL, NULL, token);
2388 token->opr.ctx_type = WORD_LAST;
2392 token->opr.ctx_type = INSIDE_WORD;
2393 tree_first = create_token_tree (dfa, NULL, NULL, token);
2394 token->opr.ctx_type = INSIDE_NOTWORD;
2396 tree_last = create_token_tree (dfa, NULL, NULL, token);
2397 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2398 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2406 tree = create_token_tree (dfa, NULL, NULL, token);
2407 if (BE (tree == NULL, 0))
2413 /* We must return here, since ANCHORs can't be followed
2414 by repetition operators.
2415 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2416 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2417 fetch_token (token, regexp, syntax);
2420 tree = create_token_tree (dfa, NULL, NULL, token);
2421 if (BE (tree == NULL, 0))
2426 if (dfa->mb_cur_max > 1)
2427 dfa->has_mb_node = 1;
2431 tree = build_charclass_op (dfa, regexp->trans,
2434 token->type == OP_NOTWORD, err);
2435 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2440 tree = build_charclass_op (dfa, regexp->trans,
2443 token->type == OP_NOTSPACE, err);
2444 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2454 /* Must not happen? */
2460 fetch_token (token, regexp, syntax);
2462 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2463 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2465 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2466 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2468 /* In BRE consecutive duplications are not allowed. */
2469 if ((syntax & RE_CONTEXT_INVALID_DUP)
2470 && (token->type == OP_DUP_ASTERISK
2471 || token->type == OP_OPEN_DUP_NUM))
2481 /* This function build the following tree, from regular expression
2489 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2490 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2492 re_dfa_t *dfa = preg->buffer;
2495 cur_nsub = preg->re_nsub++;
2497 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2499 /* The subexpression may be a null string. */
2500 if (token->type == OP_CLOSE_SUBEXP)
2504 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2505 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2508 postorder (tree, free_tree, NULL);
2511 if (BE (*err != REG_NOERROR, 0))
2515 if (cur_nsub <= '9' - '1')
2516 dfa->completed_bkref_map |= 1 << cur_nsub;
2518 tree = create_tree (dfa, tree, NULL, SUBEXP);
2519 if (BE (tree == NULL, 0))
2524 tree->token.opr.idx = cur_nsub;
2528 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2531 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2532 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2534 bin_tree_t *tree = NULL, *old_tree = NULL;
2535 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2536 re_token_t start_token = *token;
2538 if (token->type == OP_OPEN_DUP_NUM)
2541 start = fetch_number (regexp, token, syntax);
2542 if (start == REG_MISSING)
2544 if (token->type == CHARACTER && token->opr.c == ',')
2545 start = 0; /* We treat "{,m}" as "{0,m}". */
2548 *err = REG_BADBR; /* <re>{} is invalid. */
2552 if (BE (start != REG_ERROR, 1))
2554 /* We treat "{n}" as "{n,n}". */
2555 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2556 : ((token->type == CHARACTER && token->opr.c == ',')
2557 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2559 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2561 /* Invalid sequence. */
2562 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2564 if (token->type == END_OF_RE)
2572 /* If the syntax bit is set, rollback. */
2573 re_string_set_index (regexp, start_idx);
2574 *token = start_token;
2575 token->type = CHARACTER;
2576 /* mb_partial and word_char bits should be already initialized by
2581 if (BE ((end != REG_MISSING && start > end)
2582 || token->type != OP_CLOSE_DUP_NUM, 0))
2584 /* First number greater than second. */
2589 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2597 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2598 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2601 fetch_token (token, regexp, syntax);
2603 if (BE (elem == NULL, 0))
2605 if (BE (start == 0 && end == 0, 0))
2607 postorder (elem, free_tree, NULL);
2611 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2612 if (BE (start > 0, 0))
2615 for (i = 2; i <= start; ++i)
2617 elem = duplicate_tree (elem, dfa);
2618 tree = create_tree (dfa, tree, elem, CONCAT);
2619 if (BE (elem == NULL || tree == NULL, 0))
2620 goto parse_dup_op_espace;
2626 /* Duplicate ELEM before it is marked optional. */
2627 elem = duplicate_tree (elem, dfa);
2633 if (elem->token.type == SUBEXP)
2635 uintptr_t subidx = elem->token.opr.idx;
2636 postorder (elem, mark_opt_subexp, (void *) subidx);
2639 tree = create_tree (dfa, elem, NULL,
2640 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2641 if (BE (tree == NULL, 0))
2642 goto parse_dup_op_espace;
2644 /* From gnulib's "intprops.h":
2645 True if the arithmetic type T is signed. */
2646 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2648 /* This loop is actually executed only when end != REG_MISSING,
2649 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2650 already created the start+1-th copy. */
2651 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2652 for (i = start + 2; i <= end; ++i)
2654 elem = duplicate_tree (elem, dfa);
2655 tree = create_tree (dfa, tree, elem, CONCAT);
2656 if (BE (elem == NULL || tree == NULL, 0))
2657 goto parse_dup_op_espace;
2659 tree = create_tree (dfa, tree, NULL, OP_ALT);
2660 if (BE (tree == NULL, 0))
2661 goto parse_dup_op_espace;
2665 tree = create_tree (dfa, old_tree, tree, CONCAT);
2669 parse_dup_op_espace:
2674 /* Size of the names for collating symbol/equivalence_class/character_class.
2675 I'm not sure, but maybe enough. */
2676 #define BRACKET_NAME_BUF_SIZE 32
2679 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2680 Build the range expression which starts from START_ELEM, and ends
2681 at END_ELEM. The result are written to MBCSET and SBCSET.
2682 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2683 mbcset->range_ends, is a pointer argument since we may
2686 static reg_errcode_t
2688 # ifdef RE_ENABLE_I18N
2689 build_range_exp (const reg_syntax_t syntax,
2691 re_charset_t *mbcset,
2693 const bracket_elem_t *start_elem,
2694 const bracket_elem_t *end_elem)
2695 # else /* not RE_ENABLE_I18N */
2696 build_range_exp (const reg_syntax_t syntax,
2698 const bracket_elem_t *start_elem,
2699 const bracket_elem_t *end_elem)
2700 # endif /* not RE_ENABLE_I18N */
2702 unsigned int start_ch, end_ch;
2703 /* Equivalence Classes and Character Classes can't be a range start/end. */
2704 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2705 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2709 /* We can handle no multi character collating elements without libc
2711 if (BE ((start_elem->type == COLL_SYM
2712 && strlen ((char *) start_elem->opr.name) > 1)
2713 || (end_elem->type == COLL_SYM
2714 && strlen ((char *) end_elem->opr.name) > 1), 0))
2715 return REG_ECOLLATE;
2717 # ifdef RE_ENABLE_I18N
2723 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2724 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2726 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2727 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2729 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2730 ? __btowc (start_ch) : start_elem->opr.wch);
2731 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2732 ? __btowc (end_ch) : end_elem->opr.wch);
2733 if (start_wc == WEOF || end_wc == WEOF)
2734 return REG_ECOLLATE;
2735 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2738 /* Got valid collation sequence values, add them as a new entry.
2739 However, for !_LIBC we have no collation elements: if the
2740 character set is single byte, the single byte character set
2741 that we build below suffices. parse_bracket_exp passes
2742 no MBCSET if dfa->mb_cur_max == 1. */
2745 /* Check the space of the arrays. */
2746 if (BE (*range_alloc == mbcset->nranges, 0))
2748 /* There is not enough space, need realloc. */
2749 wchar_t *new_array_start, *new_array_end;
2752 /* +1 in case of mbcset->nranges is 0. */
2753 new_nranges = 2 * mbcset->nranges + 1;
2754 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2755 are NULL if *range_alloc == 0. */
2756 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2758 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2761 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2764 mbcset->range_starts = new_array_start;
2765 mbcset->range_ends = new_array_end;
2766 *range_alloc = new_nranges;
2769 mbcset->range_starts[mbcset->nranges] = start_wc;
2770 mbcset->range_ends[mbcset->nranges++] = end_wc;
2773 /* Build the table for single byte characters. */
2774 for (wc = 0; wc < SBC_MAX; ++wc)
2776 if (start_wc <= wc && wc <= end_wc)
2777 bitset_set (sbcset, wc);
2780 # else /* not RE_ENABLE_I18N */
2783 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2784 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2786 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2787 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2789 if (start_ch > end_ch)
2791 /* Build the table for single byte characters. */
2792 for (ch = 0; ch < SBC_MAX; ++ch)
2793 if (start_ch <= ch && ch <= end_ch)
2794 bitset_set (sbcset, ch);
2796 # endif /* not RE_ENABLE_I18N */
2799 #endif /* not _LIBC */
2802 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2803 Build the collating element which is represented by NAME.
2804 The result are written to MBCSET and SBCSET.
2805 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2806 pointer argument since we may update it. */
2808 static reg_errcode_t
2810 # ifdef RE_ENABLE_I18N
2811 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2812 Idx *coll_sym_alloc, const unsigned char *name)
2813 # else /* not RE_ENABLE_I18N */
2814 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2815 # endif /* not RE_ENABLE_I18N */
2817 size_t name_len = strlen ((const char *) name);
2818 if (BE (name_len != 1, 0))
2819 return REG_ECOLLATE;
2822 bitset_set (sbcset, name[0]);
2826 #endif /* not _LIBC */
2828 /* This function parse bracket expression like "[abc]", "[a-c]",
2832 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2833 reg_syntax_t syntax, reg_errcode_t *err)
2836 const unsigned char *collseqmb;
2837 const char *collseqwc;
2840 const int32_t *symb_table;
2841 const unsigned char *extra;
2843 /* Local function for parse_bracket_exp used in _LIBC environment.
2844 Seek the collating symbol entry corresponding to NAME.
2845 Return the index of the symbol in the SYMB_TABLE,
2846 or -1 if not found. */
2849 __attribute__ ((always_inline))
2850 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2854 for (elem = 0; elem < table_size; elem++)
2855 if (symb_table[2 * elem] != 0)
2857 int32_t idx = symb_table[2 * elem + 1];
2858 /* Skip the name of collating element name. */
2859 idx += 1 + extra[idx];
2860 if (/* Compare the length of the name. */
2861 name_len == extra[idx]
2862 /* Compare the name. */
2863 && memcmp (name, &extra[idx + 1], name_len) == 0)
2864 /* Yep, this is the entry. */
2870 /* Local function for parse_bracket_exp used in _LIBC environment.
2871 Look up the collation sequence value of BR_ELEM.
2872 Return the value if succeeded, UINT_MAX otherwise. */
2874 auto inline unsigned int
2875 __attribute__ ((always_inline))
2876 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2878 if (br_elem->type == SB_CHAR)
2881 if (MB_CUR_MAX == 1)
2884 return collseqmb[br_elem->opr.ch];
2887 wint_t wc = __btowc (br_elem->opr.ch);
2888 return __collseq_table_lookup (collseqwc, wc);
2891 else if (br_elem->type == MB_CHAR)
2894 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2896 else if (br_elem->type == COLL_SYM)
2898 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2902 elem = seek_collating_symbol_entry (br_elem->opr.name,
2906 /* We found the entry. */
2907 idx = symb_table[2 * elem + 1];
2908 /* Skip the name of collating element name. */
2909 idx += 1 + extra[idx];
2910 /* Skip the byte sequence of the collating element. */
2911 idx += 1 + extra[idx];
2912 /* Adjust for the alignment. */
2913 idx = (idx + 3) & ~3;
2914 /* Skip the multibyte collation sequence value. */
2915 idx += sizeof (unsigned int);
2916 /* Skip the wide char sequence of the collating element. */
2917 idx += sizeof (unsigned int) *
2918 (1 + *(unsigned int *) (extra + idx));
2919 /* Return the collation sequence value. */
2920 return *(unsigned int *) (extra + idx);
2922 else if (sym_name_len == 1)
2924 /* No valid character. Match it as a single byte
2926 return collseqmb[br_elem->opr.name[0]];
2929 else if (sym_name_len == 1)
2930 return collseqmb[br_elem->opr.name[0]];
2935 /* Local function for parse_bracket_exp used in _LIBC environment.
2936 Build the range expression which starts from START_ELEM, and ends
2937 at END_ELEM. The result are written to MBCSET and SBCSET.
2938 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2939 mbcset->range_ends, is a pointer argument since we may
2942 auto inline reg_errcode_t
2943 __attribute__ ((always_inline))
2944 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2945 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2948 uint32_t start_collseq;
2949 uint32_t end_collseq;
2951 /* Equivalence Classes and Character Classes can't be a range
2953 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2954 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2958 /* FIXME: Implement rational ranges here, too. */
2959 start_collseq = lookup_collation_sequence_value (start_elem);
2960 end_collseq = lookup_collation_sequence_value (end_elem);
2961 /* Check start/end collation sequence values. */
2962 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2963 return REG_ECOLLATE;
2964 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2967 /* Got valid collation sequence values, add them as a new entry.
2968 However, if we have no collation elements, and the character set
2969 is single byte, the single byte character set that we
2970 build below suffices. */
2971 if (nrules > 0 || dfa->mb_cur_max > 1)
2973 /* Check the space of the arrays. */
2974 if (BE (*range_alloc == mbcset->nranges, 0))
2976 /* There is not enough space, need realloc. */
2977 uint32_t *new_array_start;
2978 uint32_t *new_array_end;
2981 /* +1 in case of mbcset->nranges is 0. */
2982 new_nranges = 2 * mbcset->nranges + 1;
2983 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2985 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2988 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2991 mbcset->range_starts = new_array_start;
2992 mbcset->range_ends = new_array_end;
2993 *range_alloc = new_nranges;
2996 mbcset->range_starts[mbcset->nranges] = start_collseq;
2997 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3000 /* Build the table for single byte characters. */
3001 for (ch = 0; ch < SBC_MAX; ch++)
3003 uint32_t ch_collseq;
3005 if (MB_CUR_MAX == 1)
3008 ch_collseq = collseqmb[ch];
3010 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3011 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3012 bitset_set (sbcset, ch);
3017 /* Local function for parse_bracket_exp used in _LIBC environment.
3018 Build the collating element which is represented by NAME.
3019 The result are written to MBCSET and SBCSET.
3020 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3021 pointer argument since we may update it. */
3023 auto inline reg_errcode_t
3024 __attribute__ ((always_inline))
3025 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3026 Idx *coll_sym_alloc, const unsigned char *name)
3029 size_t name_len = strlen ((const char *) name);
3032 elem = seek_collating_symbol_entry (name, name_len);
3035 /* We found the entry. */
3036 idx = symb_table[2 * elem + 1];
3037 /* Skip the name of collating element name. */
3038 idx += 1 + extra[idx];
3040 else if (name_len == 1)
3042 /* No valid character, treat it as a normal
3044 bitset_set (sbcset, name[0]);
3048 return REG_ECOLLATE;
3050 /* Got valid collation sequence, add it as a new entry. */
3051 /* Check the space of the arrays. */
3052 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3054 /* Not enough, realloc it. */
3055 /* +1 in case of mbcset->ncoll_syms is 0. */
3056 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3057 /* Use realloc since mbcset->coll_syms is NULL
3059 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3060 new_coll_sym_alloc);
3061 if (BE (new_coll_syms == NULL, 0))
3063 mbcset->coll_syms = new_coll_syms;
3064 *coll_sym_alloc = new_coll_sym_alloc;
3066 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3071 if (BE (name_len != 1, 0))
3072 return REG_ECOLLATE;
3075 bitset_set (sbcset, name[0]);
3082 re_token_t br_token;
3083 re_bitset_ptr_t sbcset;
3084 #ifdef RE_ENABLE_I18N
3085 re_charset_t *mbcset;
3086 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3087 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3088 #endif /* not RE_ENABLE_I18N */
3089 bool non_match = false;
3090 bin_tree_t *work_tree;
3092 bool first_round = true;
3094 collseqmb = (const unsigned char *)
3095 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3096 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3102 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3103 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3104 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3105 _NL_COLLATE_SYMB_TABLEMB);
3106 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3107 _NL_COLLATE_SYMB_EXTRAMB);
3110 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3111 #ifdef RE_ENABLE_I18N
3112 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3113 #endif /* RE_ENABLE_I18N */
3114 #ifdef RE_ENABLE_I18N
3115 if (BE (sbcset == NULL || mbcset == NULL, 0))
3117 if (BE (sbcset == NULL, 0))
3118 #endif /* RE_ENABLE_I18N */
3121 #ifdef RE_ENABLE_I18N
3128 token_len = peek_token_bracket (token, regexp, syntax);
3129 if (BE (token->type == END_OF_RE, 0))
3132 goto parse_bracket_exp_free_return;
3134 if (token->type == OP_NON_MATCH_LIST)
3136 #ifdef RE_ENABLE_I18N
3137 mbcset->non_match = 1;
3138 #endif /* not RE_ENABLE_I18N */
3140 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3141 bitset_set (sbcset, '\n');
3142 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3143 token_len = peek_token_bracket (token, regexp, syntax);
3144 if (BE (token->type == END_OF_RE, 0))
3147 goto parse_bracket_exp_free_return;
3151 /* We treat the first ']' as a normal character. */
3152 if (token->type == OP_CLOSE_BRACKET)
3153 token->type = CHARACTER;
3157 bracket_elem_t start_elem, end_elem;
3158 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3159 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3162 bool is_range_exp = false;
3165 start_elem.opr.name = start_name_buf;
3166 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3167 syntax, first_round);
3168 if (BE (ret != REG_NOERROR, 0))
3171 goto parse_bracket_exp_free_return;
3173 first_round = false;
3175 /* Get information about the next token. We need it in any case. */
3176 token_len = peek_token_bracket (token, regexp, syntax);
3178 /* Do not check for ranges if we know they are not allowed. */
3179 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3181 if (BE (token->type == END_OF_RE, 0))
3184 goto parse_bracket_exp_free_return;
3186 if (token->type == OP_CHARSET_RANGE)
3188 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3189 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3190 if (BE (token2.type == END_OF_RE, 0))
3193 goto parse_bracket_exp_free_return;
3195 if (token2.type == OP_CLOSE_BRACKET)
3197 /* We treat the last '-' as a normal character. */
3198 re_string_skip_bytes (regexp, -token_len);
3199 token->type = CHARACTER;
3202 is_range_exp = true;
3206 if (is_range_exp == true)
3208 end_elem.opr.name = end_name_buf;
3209 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3211 if (BE (ret != REG_NOERROR, 0))
3214 goto parse_bracket_exp_free_return;
3217 token_len = peek_token_bracket (token, regexp, syntax);
3220 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3221 &start_elem, &end_elem);
3223 # ifdef RE_ENABLE_I18N
3224 *err = build_range_exp (syntax, sbcset,
3225 dfa->mb_cur_max > 1 ? mbcset : NULL,
3226 &range_alloc, &start_elem, &end_elem);
3228 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3230 #endif /* RE_ENABLE_I18N */
3231 if (BE (*err != REG_NOERROR, 0))
3232 goto parse_bracket_exp_free_return;
3236 switch (start_elem.type)
3239 bitset_set (sbcset, start_elem.opr.ch);
3241 #ifdef RE_ENABLE_I18N
3243 /* Check whether the array has enough space. */
3244 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3246 wchar_t *new_mbchars;
3247 /* Not enough, realloc it. */
3248 /* +1 in case of mbcset->nmbchars is 0. */
3249 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3250 /* Use realloc since array is NULL if *alloc == 0. */
3251 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3253 if (BE (new_mbchars == NULL, 0))
3254 goto parse_bracket_exp_espace;
3255 mbcset->mbchars = new_mbchars;
3257 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3259 #endif /* RE_ENABLE_I18N */
3261 *err = build_equiv_class (sbcset,
3262 #ifdef RE_ENABLE_I18N
3263 mbcset, &equiv_class_alloc,
3264 #endif /* RE_ENABLE_I18N */
3265 start_elem.opr.name);
3266 if (BE (*err != REG_NOERROR, 0))
3267 goto parse_bracket_exp_free_return;
3270 *err = build_collating_symbol (sbcset,
3271 #ifdef RE_ENABLE_I18N
3272 mbcset, &coll_sym_alloc,
3273 #endif /* RE_ENABLE_I18N */
3274 start_elem.opr.name);
3275 if (BE (*err != REG_NOERROR, 0))
3276 goto parse_bracket_exp_free_return;
3279 *err = build_charclass (regexp->trans, sbcset,
3280 #ifdef RE_ENABLE_I18N
3281 mbcset, &char_class_alloc,
3282 #endif /* RE_ENABLE_I18N */
3283 (const char *) start_elem.opr.name,
3285 if (BE (*err != REG_NOERROR, 0))
3286 goto parse_bracket_exp_free_return;
3293 if (BE (token->type == END_OF_RE, 0))
3296 goto parse_bracket_exp_free_return;
3298 if (token->type == OP_CLOSE_BRACKET)
3302 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3304 /* If it is non-matching list. */
3306 bitset_not (sbcset);
3308 #ifdef RE_ENABLE_I18N
3309 /* Ensure only single byte characters are set. */
3310 if (dfa->mb_cur_max > 1)
3311 bitset_mask (sbcset, dfa->sb_char);
3313 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3314 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3315 || mbcset->non_match)))
3317 bin_tree_t *mbc_tree;
3319 /* Build a tree for complex bracket. */
3320 dfa->has_mb_node = 1;
3321 br_token.type = COMPLEX_BRACKET;
3322 br_token.opr.mbcset = mbcset;
3323 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3324 if (BE (mbc_tree == NULL, 0))
3325 goto parse_bracket_exp_espace;
3326 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3327 if (sbcset[sbc_idx])
3329 /* If there are no bits set in sbcset, there is no point
3330 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3331 if (sbc_idx < BITSET_WORDS)
3333 /* Build a tree for simple bracket. */
3334 br_token.type = SIMPLE_BRACKET;
3335 br_token.opr.sbcset = sbcset;
3336 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3337 if (BE (work_tree == NULL, 0))
3338 goto parse_bracket_exp_espace;
3340 /* Then join them by ALT node. */
3341 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3342 if (BE (work_tree == NULL, 0))
3343 goto parse_bracket_exp_espace;
3348 work_tree = mbc_tree;
3352 #endif /* not RE_ENABLE_I18N */
3354 #ifdef RE_ENABLE_I18N
3355 free_charset (mbcset);
3357 /* Build a tree for simple bracket. */
3358 br_token.type = SIMPLE_BRACKET;
3359 br_token.opr.sbcset = sbcset;
3360 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3361 if (BE (work_tree == NULL, 0))
3362 goto parse_bracket_exp_espace;
3366 parse_bracket_exp_espace:
3368 parse_bracket_exp_free_return:
3370 #ifdef RE_ENABLE_I18N
3371 free_charset (mbcset);
3372 #endif /* RE_ENABLE_I18N */
3376 /* Parse an element in the bracket expression. */
3378 static reg_errcode_t
3379 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3380 re_token_t *token, int token_len, re_dfa_t *dfa,
3381 reg_syntax_t syntax, bool accept_hyphen)
3383 #ifdef RE_ENABLE_I18N
3385 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3386 if (cur_char_size > 1)
3388 elem->type = MB_CHAR;
3389 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3390 re_string_skip_bytes (regexp, cur_char_size);
3393 #endif /* RE_ENABLE_I18N */
3394 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3395 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3396 || token->type == OP_OPEN_EQUIV_CLASS)
3397 return parse_bracket_symbol (elem, regexp, token);
3398 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3400 /* A '-' must only appear as anything but a range indicator before
3401 the closing bracket. Everything else is an error. */
3403 (void) peek_token_bracket (&token2, regexp, syntax);
3404 if (token2.type != OP_CLOSE_BRACKET)
3405 /* The actual error value is not standardized since this whole
3406 case is undefined. But ERANGE makes good sense. */
3409 elem->type = SB_CHAR;
3410 elem->opr.ch = token->opr.c;
3414 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3415 such as [:<character_class>:], [.<collating_element>.], and
3416 [=<equivalent_class>=]. */
3418 static reg_errcode_t
3419 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3422 unsigned char ch, delim = token->opr.c;
3424 if (re_string_eoi(regexp))
3428 if (i >= BRACKET_NAME_BUF_SIZE)
3430 if (token->type == OP_OPEN_CHAR_CLASS)
3431 ch = re_string_fetch_byte_case (regexp);
3433 ch = re_string_fetch_byte (regexp);
3434 if (re_string_eoi(regexp))
3436 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3438 elem->opr.name[i] = ch;
3440 re_string_skip_bytes (regexp, 1);
3441 elem->opr.name[i] = '\0';
3442 switch (token->type)
3444 case OP_OPEN_COLL_ELEM:
3445 elem->type = COLL_SYM;
3447 case OP_OPEN_EQUIV_CLASS:
3448 elem->type = EQUIV_CLASS;
3450 case OP_OPEN_CHAR_CLASS:
3451 elem->type = CHAR_CLASS;
3459 /* Helper function for parse_bracket_exp.
3460 Build the equivalence class which is represented by NAME.
3461 The result are written to MBCSET and SBCSET.
3462 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3463 is a pointer argument since we may update it. */
3465 static reg_errcode_t
3466 #ifdef RE_ENABLE_I18N
3467 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3468 Idx *equiv_class_alloc, const unsigned char *name)
3469 #else /* not RE_ENABLE_I18N */
3470 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3471 #endif /* not RE_ENABLE_I18N */
3474 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3477 const int32_t *table, *indirect;
3478 const unsigned char *weights, *extra, *cp;
3479 unsigned char char_buf[2];
3483 /* This #include defines a local function! */
3484 # include <locale/weight.h>
3485 /* Calculate the index for equivalence class. */
3487 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3488 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3489 _NL_COLLATE_WEIGHTMB);
3490 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3491 _NL_COLLATE_EXTRAMB);
3492 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3493 _NL_COLLATE_INDIRECTMB);
3494 idx1 = findidx (&cp, -1);
3495 if (BE (idx1 == 0 || *cp != '\0', 0))
3496 /* This isn't a valid character. */
3497 return REG_ECOLLATE;
3499 /* Build single byte matching table for this equivalence class. */
3500 len = weights[idx1 & 0xffffff];
3501 for (ch = 0; ch < SBC_MAX; ++ch)
3505 idx2 = findidx (&cp, 1);
3510 /* This isn't a valid character. */
3512 /* Compare only if the length matches and the collation rule
3513 index is the same. */
3514 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3518 while (cnt <= len &&
3519 weights[(idx1 & 0xffffff) + 1 + cnt]
3520 == weights[(idx2 & 0xffffff) + 1 + cnt])
3524 bitset_set (sbcset, ch);
3527 /* Check whether the array has enough space. */
3528 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3530 /* Not enough, realloc it. */
3531 /* +1 in case of mbcset->nequiv_classes is 0. */
3532 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3533 /* Use realloc since the array is NULL if *alloc == 0. */
3534 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3536 new_equiv_class_alloc);
3537 if (BE (new_equiv_classes == NULL, 0))
3539 mbcset->equiv_classes = new_equiv_classes;
3540 *equiv_class_alloc = new_equiv_class_alloc;
3542 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3547 if (BE (strlen ((const char *) name) != 1, 0))
3548 return REG_ECOLLATE;
3549 bitset_set (sbcset, *name);
3554 /* Helper function for parse_bracket_exp.
3555 Build the character class which is represented by NAME.
3556 The result are written to MBCSET and SBCSET.
3557 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3558 is a pointer argument since we may update it. */
3560 static reg_errcode_t
3561 #ifdef RE_ENABLE_I18N
3562 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3563 re_charset_t *mbcset, Idx *char_class_alloc,
3564 const char *class_name, reg_syntax_t syntax)
3565 #else /* not RE_ENABLE_I18N */
3566 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3567 const char *class_name, reg_syntax_t syntax)
3568 #endif /* not RE_ENABLE_I18N */
3571 const char *name = class_name;
3573 /* In case of REG_ICASE "upper" and "lower" match the both of
3574 upper and lower cases. */
3575 if ((syntax & RE_ICASE)
3576 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3579 #ifdef RE_ENABLE_I18N
3580 /* Check the space of the arrays. */
3581 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3583 /* Not enough, realloc it. */
3584 /* +1 in case of mbcset->nchar_classes is 0. */
3585 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3586 /* Use realloc since array is NULL if *alloc == 0. */
3587 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3588 new_char_class_alloc);
3589 if (BE (new_char_classes == NULL, 0))
3591 mbcset->char_classes = new_char_classes;
3592 *char_class_alloc = new_char_class_alloc;
3594 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3595 #endif /* RE_ENABLE_I18N */
3597 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3599 if (BE (trans != NULL, 0)) \
3601 for (i = 0; i < SBC_MAX; ++i) \
3602 if (ctype_func (i)) \
3603 bitset_set (sbcset, trans[i]); \
3607 for (i = 0; i < SBC_MAX; ++i) \
3608 if (ctype_func (i)) \
3609 bitset_set (sbcset, i); \
3613 if (strcmp (name, "alnum") == 0)
3614 BUILD_CHARCLASS_LOOP (isalnum);
3615 else if (strcmp (name, "cntrl") == 0)
3616 BUILD_CHARCLASS_LOOP (iscntrl);
3617 else if (strcmp (name, "lower") == 0)
3618 BUILD_CHARCLASS_LOOP (islower);
3619 else if (strcmp (name, "space") == 0)
3620 BUILD_CHARCLASS_LOOP (isspace);
3621 else if (strcmp (name, "alpha") == 0)
3622 BUILD_CHARCLASS_LOOP (isalpha);
3623 else if (strcmp (name, "digit") == 0)
3624 BUILD_CHARCLASS_LOOP (isdigit);
3625 else if (strcmp (name, "print") == 0)
3626 BUILD_CHARCLASS_LOOP (isprint);
3627 else if (strcmp (name, "upper") == 0)
3628 BUILD_CHARCLASS_LOOP (isupper);
3629 else if (strcmp (name, "blank") == 0)
3630 BUILD_CHARCLASS_LOOP (isblank);
3631 else if (strcmp (name, "graph") == 0)
3632 BUILD_CHARCLASS_LOOP (isgraph);
3633 else if (strcmp (name, "punct") == 0)
3634 BUILD_CHARCLASS_LOOP (ispunct);
3635 else if (strcmp (name, "xdigit") == 0)
3636 BUILD_CHARCLASS_LOOP (isxdigit);
3644 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3645 const char *class_name,
3646 const char *extra, bool non_match,
3649 re_bitset_ptr_t sbcset;
3650 #ifdef RE_ENABLE_I18N
3651 re_charset_t *mbcset;
3653 #endif /* not RE_ENABLE_I18N */
3655 re_token_t br_token;
3658 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3659 #ifdef RE_ENABLE_I18N
3660 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3661 #endif /* RE_ENABLE_I18N */
3663 #ifdef RE_ENABLE_I18N
3664 if (BE (sbcset == NULL || mbcset == NULL, 0))
3665 #else /* not RE_ENABLE_I18N */
3666 if (BE (sbcset == NULL, 0))
3667 #endif /* not RE_ENABLE_I18N */
3675 #ifdef RE_ENABLE_I18N
3676 mbcset->non_match = 1;
3677 #endif /* not RE_ENABLE_I18N */
3680 /* We don't care the syntax in this case. */
3681 ret = build_charclass (trans, sbcset,
3682 #ifdef RE_ENABLE_I18N
3684 #endif /* RE_ENABLE_I18N */
3687 if (BE (ret != REG_NOERROR, 0))
3690 #ifdef RE_ENABLE_I18N
3691 free_charset (mbcset);
3692 #endif /* RE_ENABLE_I18N */
3696 /* \w match '_' also. */
3697 for (; *extra; extra++)
3698 bitset_set (sbcset, *extra);
3700 /* If it is non-matching list. */
3702 bitset_not (sbcset);
3704 #ifdef RE_ENABLE_I18N
3705 /* Ensure only single byte characters are set. */
3706 if (dfa->mb_cur_max > 1)
3707 bitset_mask (sbcset, dfa->sb_char);
3710 /* Build a tree for simple bracket. */
3711 br_token.type = SIMPLE_BRACKET;
3712 br_token.opr.sbcset = sbcset;
3713 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3714 if (BE (tree == NULL, 0))
3715 goto build_word_op_espace;
3717 #ifdef RE_ENABLE_I18N
3718 if (dfa->mb_cur_max > 1)
3720 bin_tree_t *mbc_tree;
3721 /* Build a tree for complex bracket. */
3722 br_token.type = COMPLEX_BRACKET;
3723 br_token.opr.mbcset = mbcset;
3724 dfa->has_mb_node = 1;
3725 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3726 if (BE (mbc_tree == NULL, 0))
3727 goto build_word_op_espace;
3728 /* Then join them by ALT node. */
3729 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3730 if (BE (mbc_tree != NULL, 1))
3735 free_charset (mbcset);
3738 #else /* not RE_ENABLE_I18N */
3740 #endif /* not RE_ENABLE_I18N */
3742 build_word_op_espace:
3744 #ifdef RE_ENABLE_I18N
3745 free_charset (mbcset);
3746 #endif /* RE_ENABLE_I18N */
3751 /* This is intended for the expressions like "a{1,3}".
3752 Fetch a number from 'input', and return the number.
3753 Return REG_MISSING if the number field is empty like "{,1}".
3754 Return RE_DUP_MAX + 1 if the number field is too large.
3755 Return REG_ERROR if an error occurred. */
3758 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3760 Idx num = REG_MISSING;
3764 fetch_token (token, input, syntax);
3766 if (BE (token->type == END_OF_RE, 0))
3768 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3770 num = ((token->type != CHARACTER || c < '0' || '9' < c
3771 || num == REG_ERROR)
3773 : num == REG_MISSING
3775 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3780 #ifdef RE_ENABLE_I18N
3782 free_charset (re_charset_t *cset)
3784 re_free (cset->mbchars);
3786 re_free (cset->coll_syms);
3787 re_free (cset->equiv_classes);
3788 re_free (cset->range_starts);
3789 re_free (cset->range_ends);
3791 re_free (cset->char_classes);
3794 #endif /* RE_ENABLE_I18N */
3796 /* Functions for binary tree operation. */
3798 /* Create a tree node. */
3801 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3802 re_token_type_t type)
3806 return create_token_tree (dfa, left, right, &t);
3810 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3811 const re_token_t *token)
3814 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3816 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3818 if (storage == NULL)
3820 storage->next = dfa->str_tree_storage;
3821 dfa->str_tree_storage = storage;
3822 dfa->str_tree_storage_idx = 0;
3824 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3826 tree->parent = NULL;
3828 tree->right = right;
3829 tree->token = *token;
3830 tree->token.duplicated = 0;
3831 tree->token.opt_subexp = 0;
3834 tree->node_idx = REG_MISSING;
3837 left->parent = tree;
3839 right->parent = tree;
3843 /* Mark the tree SRC as an optional subexpression.
3844 To be called from preorder or postorder. */
3846 static reg_errcode_t
3847 mark_opt_subexp (void *extra, bin_tree_t *node)
3849 Idx idx = (uintptr_t) extra;
3850 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3851 node->token.opt_subexp = 1;
3856 /* Free the allocated memory inside NODE. */
3859 free_token (re_token_t *node)
3861 #ifdef RE_ENABLE_I18N
3862 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3863 free_charset (node->opr.mbcset);
3865 #endif /* RE_ENABLE_I18N */
3866 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3867 re_free (node->opr.sbcset);
3870 /* Worker function for tree walking. Free the allocated memory inside NODE
3871 and its children. */
3873 static reg_errcode_t
3874 free_tree (void *extra, bin_tree_t *node)
3876 free_token (&node->token);
3881 /* Duplicate the node SRC, and return new node. This is a preorder
3882 visit similar to the one implemented by the generic visitor, but
3883 we need more infrastructure to maintain two parallel trees --- so,
3884 it's easier to duplicate. */
3887 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3889 const bin_tree_t *node;
3890 bin_tree_t *dup_root;
3891 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3893 for (node = root; ; )
3895 /* Create a new tree and link it back to the current parent. */
3896 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3899 (*p_new)->parent = dup_node;
3900 (*p_new)->token.duplicated = 1;
3903 /* Go to the left node, or up and to the right. */
3907 p_new = &dup_node->left;
3911 const bin_tree_t *prev = NULL;
3912 while (node->right == prev || node->right == NULL)
3915 node = node->parent;
3916 dup_node = dup_node->parent;
3921 p_new = &dup_node->right;