1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4 Copyright (C) 2002-2011 Free Software Foundation, Inc.
5 This file is part of the GNU C Library.
6 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License along
19 with this program; if not, write to the Free Software Foundation,
20 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
23 size_t length, reg_syntax_t syntax);
24 static void re_compile_fastmap_iter (regex_t *bufp,
25 const re_dfastate_t *init_state,
27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34 static void optimize_utf8 (re_dfa_t *dfa);
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t preorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
40 static reg_errcode_t postorder (bin_tree_t *root,
41 reg_errcode_t (fn (void *, bin_tree_t *)),
43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
51 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
52 unsigned int constraint);
53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
57 static Idx fetch_number (re_string_t *input, re_token_t *token,
59 static int peek_token (re_token_t *token, re_string_t *input,
60 reg_syntax_t syntax) internal_function;
61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
62 reg_syntax_t syntax, reg_errcode_t *err);
63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 Idx nest, reg_errcode_t *err);
66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 Idx nest, reg_errcode_t *err);
69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 Idx nest, reg_errcode_t *err);
72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 Idx nest, reg_errcode_t *err);
75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
76 re_dfa_t *dfa, re_token_t *token,
77 reg_syntax_t syntax, reg_errcode_t *err);
78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
79 re_token_t *token, reg_syntax_t syntax,
81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
83 re_token_t *token, int token_len,
87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
93 Idx *equiv_class_alloc,
94 const unsigned char *name);
95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 Idx *char_class_alloc,
99 const unsigned char *class_name,
100 reg_syntax_t syntax);
101 #else /* not RE_ENABLE_I18N */
102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
103 const unsigned char *name);
104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
106 const unsigned char *class_name,
107 reg_syntax_t syntax);
108 #endif /* not RE_ENABLE_I18N */
109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
110 RE_TRANSLATE_TYPE trans,
111 const unsigned char *class_name,
112 const unsigned char *extra,
113 bool non_match, reg_errcode_t *err);
114 static bin_tree_t *create_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 re_token_type_t type);
117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
118 bin_tree_t *left, bin_tree_t *right,
119 const re_token_t *token);
120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
121 static void free_token (re_token_t *node);
122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125 /* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
130 static const char __re_error_msgid[] =
132 #define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
138 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
168 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
171 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
177 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 static const size_t __re_error_msgid_idx[] =
205 /* Entry points for GNU code. */
207 /* re_compile_pattern is the GNU regular expression compiler: it
208 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209 Returns 0 if the pattern was valid, otherwise an error string.
211 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212 are set in BUFP on entry. */
216 re_compile_pattern (pattern, length, bufp)
219 struct re_pattern_buffer *bufp;
220 #else /* size_t might promote */
222 re_compile_pattern (const char *pattern, size_t length,
223 struct re_pattern_buffer *bufp)
228 /* And GNU code determines whether or not to get register information
229 by passing null for the REGS argument to re_match, etc., not by
230 setting no_sub, unless RE_NO_SUB is set. */
231 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
233 /* Match anchors at newline. */
234 bufp->newline_anchor = 1;
236 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
240 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
243 weak_alias (__re_compile_pattern, re_compile_pattern)
246 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
247 also be assigned to arbitrarily: each pattern buffer stores its own
248 syntax, so it can be changed between regex compilations. */
249 /* This has no initializer because initialized variables in Emacs
250 become read-only after dumping. */
251 reg_syntax_t re_syntax_options;
254 /* Specify the precise syntax of regexps for compilation. This provides
255 for compatibility for various utilities which historically have
256 different, incompatible syntaxes.
258 The argument SYNTAX is a bit mask comprised of the various bits
259 defined in regex.h. We return the old syntax. */
262 re_set_syntax (syntax)
265 reg_syntax_t ret = re_syntax_options;
267 re_syntax_options = syntax;
271 weak_alias (__re_set_syntax, re_set_syntax)
275 re_compile_fastmap (bufp)
276 struct re_pattern_buffer *bufp;
278 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
279 char *fastmap = bufp->fastmap;
281 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
282 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
283 if (dfa->init_state != dfa->init_state_word)
284 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
285 if (dfa->init_state != dfa->init_state_nl)
286 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
287 if (dfa->init_state != dfa->init_state_begbuf)
288 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
289 bufp->fastmap_accurate = 1;
293 weak_alias (__re_compile_fastmap, re_compile_fastmap)
297 __attribute ((always_inline))
298 re_set_fastmap (char *fastmap, bool icase, int ch)
302 fastmap[tolower (ch)] = 1;
305 /* Helper function for re_compile_fastmap.
306 Compile fastmap for the initial_state INIT_STATE. */
309 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
312 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
314 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
315 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
317 Idx node = init_state->nodes.elems[node_cnt];
318 re_token_type_t type = dfa->nodes[node].type;
320 if (type == CHARACTER)
322 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
323 #ifdef RE_ENABLE_I18N
324 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
326 unsigned char buf[MB_LEN_MAX];
332 *p++ = dfa->nodes[node].opr.c;
333 while (++node < dfa->nodes_len
334 && dfa->nodes[node].type == CHARACTER
335 && dfa->nodes[node].mb_partial)
336 *p++ = dfa->nodes[node].opr.c;
337 memset (&state, '\0', sizeof (state));
338 if (__mbrtowc (&wc, (const char *) buf, p - buf,
340 && (__wcrtomb ((char *) buf, towlower (wc), &state)
342 re_set_fastmap (fastmap, false, buf[0]);
346 else if (type == SIMPLE_BRACKET)
349 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
352 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
353 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
354 if (w & ((bitset_word_t) 1 << j))
355 re_set_fastmap (fastmap, icase, ch);
358 #ifdef RE_ENABLE_I18N
359 else if (type == COMPLEX_BRACKET)
361 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
365 /* See if we have to try all bytes which start multiple collation
367 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
368 collation element, and don't catch 'b' since 'b' is
369 the only collation element which starts from 'b' (and
370 it is caught by SIMPLE_BRACKET). */
371 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
372 && (cset->ncoll_syms || cset->nranges))
374 const int32_t *table = (const int32_t *)
375 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
376 for (i = 0; i < SBC_MAX; ++i)
378 re_set_fastmap (fastmap, icase, i);
382 /* See if we have to start the match at all multibyte characters,
383 i.e. where we would not find an invalid sequence. This only
384 applies to multibyte character sets; for single byte character
385 sets, the SIMPLE_BRACKET again suffices. */
386 if (dfa->mb_cur_max > 1
387 && (cset->nchar_classes || cset->non_match || cset->nranges
389 || cset->nequiv_classes
397 memset (&mbs, 0, sizeof (mbs));
398 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
399 re_set_fastmap (fastmap, false, (int) c);
406 /* ... Else catch all bytes which can start the mbchars. */
407 for (i = 0; i < cset->nmbchars; ++i)
411 memset (&state, '\0', sizeof (state));
412 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
413 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
414 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
416 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
418 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
423 #endif /* RE_ENABLE_I18N */
424 else if (type == OP_PERIOD
425 #ifdef RE_ENABLE_I18N
426 || type == OP_UTF8_PERIOD
427 #endif /* RE_ENABLE_I18N */
428 || type == END_OF_RE)
430 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
431 if (type == END_OF_RE)
432 bufp->can_be_null = 1;
438 /* Entry point for POSIX code. */
439 /* regcomp takes a regular expression as a string and compiles it.
441 PREG is a regex_t *. We do not expect any fields to be initialized,
442 since POSIX says we shouldn't. Thus, we set
444 `buffer' to the compiled pattern;
445 `used' to the length of the compiled pattern;
446 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
447 REG_EXTENDED bit in CFLAGS is set; otherwise, to
448 RE_SYNTAX_POSIX_BASIC;
449 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
450 `fastmap' to an allocated space for the fastmap;
451 `fastmap_accurate' to zero;
452 `re_nsub' to the number of subexpressions in PATTERN.
454 PATTERN is the address of the pattern string.
456 CFLAGS is a series of bits which affect compilation.
458 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
459 use POSIX basic syntax.
461 If REG_NEWLINE is set, then . and [^...] don't match newline.
462 Also, regexec will try a match beginning after every newline.
464 If REG_ICASE is set, then we considers upper- and lowercase
465 versions of letters to be equivalent when matching.
467 If REG_NOSUB is set, then when PREG is passed to regexec, that
468 routine will report only success or failure, and nothing about the
471 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
472 the return codes and their meanings.) */
475 regcomp (preg, pattern, cflags)
476 regex_t *_Restrict_ preg;
477 const char *_Restrict_ pattern;
481 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
482 : RE_SYNTAX_POSIX_BASIC);
488 /* Try to allocate space for the fastmap. */
489 preg->fastmap = re_malloc (char, SBC_MAX);
490 if (BE (preg->fastmap == NULL, 0))
493 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
495 /* If REG_NEWLINE is set, newlines are treated differently. */
496 if (cflags & REG_NEWLINE)
497 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
498 syntax &= ~RE_DOT_NEWLINE;
499 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
500 /* It also changes the matching behavior. */
501 preg->newline_anchor = 1;
504 preg->newline_anchor = 0;
505 preg->no_sub = !!(cflags & REG_NOSUB);
506 preg->translate = NULL;
508 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
510 /* POSIX doesn't distinguish between an unmatched open-group and an
511 unmatched close-group: both are REG_EPAREN. */
512 if (ret == REG_ERPAREN)
515 /* We have already checked preg->fastmap != NULL. */
516 if (BE (ret == REG_NOERROR, 1))
517 /* Compute the fastmap now, since regexec cannot modify the pattern
518 buffer. This function never fails in this implementation. */
519 (void) re_compile_fastmap (preg);
522 /* Some error occurred while compiling the expression. */
523 re_free (preg->fastmap);
524 preg->fastmap = NULL;
530 weak_alias (__regcomp, regcomp)
533 /* Returns a message corresponding to an error code, ERRCODE, returned
534 from either regcomp or regexec. We don't use PREG here. */
538 regerror (errcode, preg, errbuf, errbuf_size)
540 const regex_t *_Restrict_ preg;
541 char *_Restrict_ errbuf;
543 #else /* size_t might promote */
545 regerror (int errcode, const regex_t *_Restrict_ preg,
546 char *_Restrict_ errbuf, size_t errbuf_size)
553 || errcode >= (int) (sizeof (__re_error_msgid_idx)
554 / sizeof (__re_error_msgid_idx[0])), 0))
555 /* Only error codes returned by the rest of the code should be passed
556 to this routine. If we are given anything else, or if other regex
557 code generates an invalid error code, then the program has a bug.
558 Dump core so we can fix it. */
561 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
563 msg_size = strlen (msg) + 1; /* Includes the null. */
565 if (BE (errbuf_size != 0, 1))
567 size_t cpy_size = msg_size;
568 if (BE (msg_size > errbuf_size, 0))
570 cpy_size = errbuf_size - 1;
571 errbuf[cpy_size] = '\0';
573 memcpy (errbuf, msg, cpy_size);
579 weak_alias (__regerror, regerror)
583 #ifdef RE_ENABLE_I18N
584 /* This static array is used for the map to single-byte characters when
585 UTF-8 is used. Otherwise we would allocate memory just to initialize
586 it the same all the time. UTF-8 is the preferred encoding so this is
587 a worthwhile optimization. */
588 static const bitset_t utf8_sb_map =
590 /* Set the first 128 bits. */
591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
592 # error "bitset_word_t is narrower than 32 bits"
593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
594 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
596 BITSET_WORD_MAX, BITSET_WORD_MAX,
597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
601 >> (SBC_MAX % BITSET_WORD_BITS == 0
603 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
609 free_dfa_content (re_dfa_t *dfa)
614 for (i = 0; i < dfa->nodes_len; ++i)
615 free_token (dfa->nodes + i);
616 re_free (dfa->nexts);
617 for (i = 0; i < dfa->nodes_len; ++i)
619 if (dfa->eclosures != NULL)
620 re_node_set_free (dfa->eclosures + i);
621 if (dfa->inveclosures != NULL)
622 re_node_set_free (dfa->inveclosures + i);
623 if (dfa->edests != NULL)
624 re_node_set_free (dfa->edests + i);
626 re_free (dfa->edests);
627 re_free (dfa->eclosures);
628 re_free (dfa->inveclosures);
629 re_free (dfa->nodes);
631 if (dfa->state_table)
632 for (i = 0; i <= dfa->state_hash_mask; ++i)
634 struct re_state_table_entry *entry = dfa->state_table + i;
635 for (j = 0; j < entry->num; ++j)
637 re_dfastate_t *state = entry->array[j];
640 re_free (entry->array);
642 re_free (dfa->state_table);
643 #ifdef RE_ENABLE_I18N
644 if (dfa->sb_char != utf8_sb_map)
645 re_free (dfa->sb_char);
647 re_free (dfa->subexp_map);
649 re_free (dfa->re_str);
656 /* Free dynamically allocated space used by PREG. */
662 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
663 if (BE (dfa != NULL, 1))
664 free_dfa_content (dfa);
668 re_free (preg->fastmap);
669 preg->fastmap = NULL;
671 re_free (preg->translate);
672 preg->translate = NULL;
675 weak_alias (__regfree, regfree)
678 /* Entry points compatible with 4.2 BSD regex library. We don't define
679 them unless specifically requested. */
681 #if defined _REGEX_RE_COMP || defined _LIBC
683 /* BSD has one and only one pattern buffer. */
684 static struct re_pattern_buffer re_comp_buf;
688 /* Make these definitions weak in libc, so POSIX programs can redefine
689 these names if they don't use our functions, and still use
690 regcomp/regexec above without link errors. */
701 if (!re_comp_buf.buffer)
702 return gettext ("No previous regular expression");
706 if (re_comp_buf.buffer)
708 fastmap = re_comp_buf.fastmap;
709 re_comp_buf.fastmap = NULL;
710 __regfree (&re_comp_buf);
711 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
712 re_comp_buf.fastmap = fastmap;
715 if (re_comp_buf.fastmap == NULL)
717 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
718 if (re_comp_buf.fastmap == NULL)
719 return (char *) gettext (__re_error_msgid
720 + __re_error_msgid_idx[(int) REG_ESPACE]);
723 /* Since `re_exec' always passes NULL for the `regs' argument, we
724 don't need to initialize the pattern buffer fields which affect it. */
726 /* Match anchors at newlines. */
727 re_comp_buf.newline_anchor = 1;
729 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
734 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
735 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
739 libc_freeres_fn (free_mem)
741 __regfree (&re_comp_buf);
745 #endif /* _REGEX_RE_COMP */
747 /* Internal entry point.
748 Compile the regular expression PATTERN, whose length is LENGTH.
749 SYNTAX indicate regular expression's syntax. */
752 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
755 reg_errcode_t err = REG_NOERROR;
759 /* Initialize the pattern buffer. */
760 preg->fastmap_accurate = 0;
761 preg->syntax = syntax;
762 preg->not_bol = preg->not_eol = 0;
765 preg->can_be_null = 0;
766 preg->regs_allocated = REGS_UNALLOCATED;
768 /* Initialize the dfa. */
769 dfa = (re_dfa_t *) preg->buffer;
770 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
772 /* If zero allocated, but buffer is non-null, try to realloc
773 enough space. This loses if buffer's address is bogus, but
774 that is the user's responsibility. If ->buffer is NULL this
775 is a simple allocation. */
776 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
779 preg->allocated = sizeof (re_dfa_t);
780 preg->buffer = (unsigned char *) dfa;
782 preg->used = sizeof (re_dfa_t);
784 err = init_dfa (dfa, length);
785 if (BE (err != REG_NOERROR, 0))
787 free_dfa_content (dfa);
793 /* Note: length+1 will not overflow since it is checked in init_dfa. */
794 dfa->re_str = re_malloc (char, length + 1);
795 strncpy (dfa->re_str, pattern, length + 1);
798 __libc_lock_init (dfa->lock);
800 err = re_string_construct (®exp, pattern, length, preg->translate,
801 (syntax & RE_ICASE) != 0, dfa);
802 if (BE (err != REG_NOERROR, 0))
804 re_compile_internal_free_return:
805 free_workarea_compile (preg);
806 re_string_destruct (®exp);
807 free_dfa_content (dfa);
813 /* Parse the regular expression, and build a structure tree. */
815 dfa->str_tree = parse (®exp, preg, syntax, &err);
816 if (BE (dfa->str_tree == NULL, 0))
817 goto re_compile_internal_free_return;
819 /* Analyze the tree and create the nfa. */
820 err = analyze (preg);
821 if (BE (err != REG_NOERROR, 0))
822 goto re_compile_internal_free_return;
824 #ifdef RE_ENABLE_I18N
825 /* If possible, do searching in single byte encoding to speed things up. */
826 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
830 /* Then create the initial state of the dfa. */
831 err = create_initial_state (dfa);
833 /* Release work areas. */
834 free_workarea_compile (preg);
835 re_string_destruct (®exp);
837 if (BE (err != REG_NOERROR, 0))
839 free_dfa_content (dfa);
847 /* Initialize DFA. We use the length of the regular expression PAT_LEN
848 as the initial length of some arrays. */
851 init_dfa (re_dfa_t *dfa, size_t pat_len)
853 __re_size_t table_size;
857 #ifdef RE_ENABLE_I18N
858 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
860 size_t max_i18n_object_size = 0;
862 size_t max_object_size =
863 MAX (sizeof (struct re_state_table_entry),
864 MAX (sizeof (re_token_t),
865 MAX (sizeof (re_node_set),
866 MAX (sizeof (regmatch_t),
867 max_i18n_object_size))));
869 memset (dfa, '\0', sizeof (re_dfa_t));
871 /* Force allocation of str_tree_storage the first time. */
872 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
874 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
875 calculation below, and for similar doubling calculations
876 elsewhere. And it's <= rather than <, because some of the
877 doubling calculations add 1 afterwards. */
878 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
881 dfa->nodes_alloc = pat_len + 1;
882 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
884 /* table_size = 2 ^ ceil(log pat_len) */
885 for (table_size = 1; ; table_size <<= 1)
886 if (table_size > pat_len)
889 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
890 dfa->state_hash_mask = table_size - 1;
892 dfa->mb_cur_max = MB_CUR_MAX;
894 if (dfa->mb_cur_max == 6
895 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
897 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
900 codeset_name = nl_langinfo (CODESET);
901 if (strcasecmp (codeset_name, "UTF-8") == 0
902 || strcasecmp (codeset_name, "UTF8") == 0)
905 /* We check exhaustively in the loop below if this charset is a
906 superset of ASCII. */
907 dfa->map_notascii = 0;
910 #ifdef RE_ENABLE_I18N
911 if (dfa->mb_cur_max > 1)
914 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
919 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
920 if (BE (dfa->sb_char == NULL, 0))
923 /* Set the bits corresponding to single byte chars. */
924 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
925 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
927 wint_t wch = __btowc (ch);
929 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
931 if (isascii (ch) && wch != ch)
932 dfa->map_notascii = 1;
939 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
944 /* Initialize WORD_CHAR table, which indicate which character is
945 "word". In this case "word" means that it is the word construction
946 character used by some operators like "\<", "\>", etc. */
950 init_word_char (re_dfa_t *dfa)
953 dfa->word_ops_used = 1;
954 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
955 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
956 if (isalnum (ch) || ch == '_')
957 dfa->word_char[i] |= (bitset_word_t) 1 << j;
960 /* Free the work area which are only used while compiling. */
963 free_workarea_compile (regex_t *preg)
965 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
966 bin_tree_storage_t *storage, *next;
967 for (storage = dfa->str_tree_storage; storage; storage = next)
969 next = storage->next;
972 dfa->str_tree_storage = NULL;
973 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
974 dfa->str_tree = NULL;
975 re_free (dfa->org_indices);
976 dfa->org_indices = NULL;
979 /* Create initial states for all contexts. */
982 create_initial_state (re_dfa_t *dfa)
986 re_node_set init_nodes;
988 /* Initial states have the epsilon closure of the node which is
989 the first node of the regular expression. */
990 first = dfa->str_tree->first->node_idx;
991 dfa->init_node = first;
992 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
993 if (BE (err != REG_NOERROR, 0))
996 /* The back-references which are in initial states can epsilon transit,
997 since in this case all of the subexpressions can be null.
998 Then we add epsilon closures of the nodes which are the next nodes of
999 the back-references. */
1000 if (dfa->nbackref > 0)
1001 for (i = 0; i < init_nodes.nelem; ++i)
1003 Idx node_idx = init_nodes.elems[i];
1004 re_token_type_t type = dfa->nodes[node_idx].type;
1007 if (type != OP_BACK_REF)
1009 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1011 re_token_t *clexp_node;
1012 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1013 if (clexp_node->type == OP_CLOSE_SUBEXP
1014 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1017 if (clexp_idx == init_nodes.nelem)
1020 if (type == OP_BACK_REF)
1022 Idx dest_idx = dfa->edests[node_idx].elems[0];
1023 if (!re_node_set_contains (&init_nodes, dest_idx))
1025 reg_errcode_t merge_err
1026 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1027 if (merge_err != REG_NOERROR)
1034 /* It must be the first time to invoke acquire_state. */
1035 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1036 /* We don't check ERR here, since the initial state must not be NULL. */
1037 if (BE (dfa->init_state == NULL, 0))
1039 if (dfa->init_state->has_constraint)
1041 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1043 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1045 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1049 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1050 || dfa->init_state_begbuf == NULL, 0))
1054 dfa->init_state_word = dfa->init_state_nl
1055 = dfa->init_state_begbuf = dfa->init_state;
1057 re_node_set_free (&init_nodes);
1061 #ifdef RE_ENABLE_I18N
1062 /* If it is possible to do searching in single byte encoding instead of UTF-8
1063 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1064 DFA nodes where needed. */
1067 optimize_utf8 (re_dfa_t *dfa)
1071 bool mb_chars = false;
1072 bool has_period = false;
1074 for (node = 0; node < dfa->nodes_len; ++node)
1075 switch (dfa->nodes[node].type)
1078 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1082 switch (dfa->nodes[node].opr.ctx_type)
1090 /* Word anchors etc. cannot be handled. It's okay to test
1091 opr.ctx_type since constraints (for all DFA nodes) are
1092 created by ORing one or more opr.ctx_type values. */
1102 case OP_DUP_ASTERISK:
1103 case OP_OPEN_SUBEXP:
1104 case OP_CLOSE_SUBEXP:
1106 case COMPLEX_BRACKET:
1108 case SIMPLE_BRACKET:
1109 /* Just double check. */
1111 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1113 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1114 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1116 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1126 if (mb_chars || has_period)
1127 for (node = 0; node < dfa->nodes_len; ++node)
1129 if (dfa->nodes[node].type == CHARACTER
1130 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1131 dfa->nodes[node].mb_partial = 0;
1132 else if (dfa->nodes[node].type == OP_PERIOD)
1133 dfa->nodes[node].type = OP_UTF8_PERIOD;
1136 /* The search can be in single byte locale. */
1137 dfa->mb_cur_max = 1;
1139 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1143 /* Analyze the structure tree, and calculate "first", "next", "edest",
1144 "eclosure", and "inveclosure". */
1146 static reg_errcode_t
1147 analyze (regex_t *preg)
1149 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1152 /* Allocate arrays. */
1153 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1154 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1155 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1156 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1157 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1158 || dfa->eclosures == NULL, 0))
1161 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1162 if (dfa->subexp_map != NULL)
1165 for (i = 0; i < preg->re_nsub; i++)
1166 dfa->subexp_map[i] = i;
1167 preorder (dfa->str_tree, optimize_subexps, dfa);
1168 for (i = 0; i < preg->re_nsub; i++)
1169 if (dfa->subexp_map[i] != i)
1171 if (i == preg->re_nsub)
1173 free (dfa->subexp_map);
1174 dfa->subexp_map = NULL;
1178 ret = postorder (dfa->str_tree, lower_subexps, preg);
1179 if (BE (ret != REG_NOERROR, 0))
1181 ret = postorder (dfa->str_tree, calc_first, dfa);
1182 if (BE (ret != REG_NOERROR, 0))
1184 preorder (dfa->str_tree, calc_next, dfa);
1185 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1186 if (BE (ret != REG_NOERROR, 0))
1188 ret = calc_eclosure (dfa);
1189 if (BE (ret != REG_NOERROR, 0))
1192 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1193 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1194 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1197 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1198 if (BE (dfa->inveclosures == NULL, 0))
1200 ret = calc_inveclosure (dfa);
1206 /* Our parse trees are very unbalanced, so we cannot use a stack to
1207 implement parse tree visits. Instead, we use parent pointers and
1208 some hairy code in these two functions. */
1209 static reg_errcode_t
1210 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1213 bin_tree_t *node, *prev;
1215 for (node = root; ; )
1217 /* Descend down the tree, preferably to the left (or to the right
1218 if that's the only child). */
1219 while (node->left || node->right)
1227 reg_errcode_t err = fn (extra, node);
1228 if (BE (err != REG_NOERROR, 0))
1230 if (node->parent == NULL)
1233 node = node->parent;
1235 /* Go up while we have a node that is reached from the right. */
1236 while (node->right == prev || node->right == NULL);
1241 static reg_errcode_t
1242 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1247 for (node = root; ; )
1249 reg_errcode_t err = fn (extra, node);
1250 if (BE (err != REG_NOERROR, 0))
1253 /* Go to the left node, or up and to the right. */
1258 bin_tree_t *prev = NULL;
1259 while (node->right == prev || node->right == NULL)
1262 node = node->parent;
1271 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1272 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1273 backreferences as well. Requires a preorder visit. */
1274 static reg_errcode_t
1275 optimize_subexps (void *extra, bin_tree_t *node)
1277 re_dfa_t *dfa = (re_dfa_t *) extra;
1279 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1281 int idx = node->token.opr.idx;
1282 node->token.opr.idx = dfa->subexp_map[idx];
1283 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1286 else if (node->token.type == SUBEXP
1287 && node->left && node->left->token.type == SUBEXP)
1289 Idx other_idx = node->left->token.opr.idx;
1291 node->left = node->left->left;
1293 node->left->parent = node;
1295 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1296 if (other_idx < BITSET_WORD_BITS)
1297 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1303 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1304 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1305 static reg_errcode_t
1306 lower_subexps (void *extra, bin_tree_t *node)
1308 regex_t *preg = (regex_t *) extra;
1309 reg_errcode_t err = REG_NOERROR;
1311 if (node->left && node->left->token.type == SUBEXP)
1313 node->left = lower_subexp (&err, preg, node->left);
1315 node->left->parent = node;
1317 if (node->right && node->right->token.type == SUBEXP)
1319 node->right = lower_subexp (&err, preg, node->right);
1321 node->right->parent = node;
1328 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1330 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1331 bin_tree_t *body = node->left;
1332 bin_tree_t *op, *cls, *tree1, *tree;
1335 /* We do not optimize empty subexpressions, because otherwise we may
1336 have bad CONCAT nodes with NULL children. This is obviously not
1337 very common, so we do not lose much. An example that triggers
1338 this case is the sed "script" /\(\)/x. */
1339 && node->left != NULL
1340 && (node->token.opr.idx >= BITSET_WORD_BITS
1341 || !(dfa->used_bkref_map
1342 & ((bitset_word_t) 1 << node->token.opr.idx))))
1345 /* Convert the SUBEXP node to the concatenation of an
1346 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1347 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1348 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1349 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1350 tree = create_tree (dfa, op, tree1, CONCAT);
1351 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1357 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1358 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1362 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1363 nodes. Requires a postorder visit. */
1364 static reg_errcode_t
1365 calc_first (void *extra, bin_tree_t *node)
1367 re_dfa_t *dfa = (re_dfa_t *) extra;
1368 if (node->token.type == CONCAT)
1370 node->first = node->left->first;
1371 node->node_idx = node->left->node_idx;
1376 node->node_idx = re_dfa_add_node (dfa, node->token);
1377 if (BE (node->node_idx == REG_MISSING, 0))
1379 if (node->token.type == ANCHOR)
1380 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1385 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1386 static reg_errcode_t
1387 calc_next (void *extra, bin_tree_t *node)
1389 switch (node->token.type)
1391 case OP_DUP_ASTERISK:
1392 node->left->next = node;
1395 node->left->next = node->right->first;
1396 node->right->next = node->next;
1400 node->left->next = node->next;
1402 node->right->next = node->next;
1408 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1409 static reg_errcode_t
1410 link_nfa_nodes (void *extra, bin_tree_t *node)
1412 re_dfa_t *dfa = (re_dfa_t *) extra;
1413 Idx idx = node->node_idx;
1414 reg_errcode_t err = REG_NOERROR;
1416 switch (node->token.type)
1422 assert (node->next == NULL);
1425 case OP_DUP_ASTERISK:
1429 dfa->has_plural_match = 1;
1430 if (node->left != NULL)
1431 left = node->left->first->node_idx;
1433 left = node->next->node_idx;
1434 if (node->right != NULL)
1435 right = node->right->first->node_idx;
1437 right = node->next->node_idx;
1438 assert (REG_VALID_INDEX (left));
1439 assert (REG_VALID_INDEX (right));
1440 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1445 case OP_OPEN_SUBEXP:
1446 case OP_CLOSE_SUBEXP:
1447 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1451 dfa->nexts[idx] = node->next->node_idx;
1452 if (node->token.type == OP_BACK_REF)
1453 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1457 assert (!IS_EPSILON_NODE (node->token.type));
1458 dfa->nexts[idx] = node->next->node_idx;
1465 /* Duplicate the epsilon closure of the node ROOT_NODE.
1466 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1467 to their own constraint. */
1469 static reg_errcode_t
1471 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1472 Idx root_node, unsigned int init_constraint)
1474 Idx org_node, clone_node;
1476 unsigned int constraint = init_constraint;
1477 for (org_node = top_org_node, clone_node = top_clone_node;;)
1479 Idx org_dest, clone_dest;
1480 if (dfa->nodes[org_node].type == OP_BACK_REF)
1482 /* If the back reference epsilon-transit, its destination must
1483 also have the constraint. Then duplicate the epsilon closure
1484 of the destination of the back reference, and store it in
1485 edests of the back reference. */
1486 org_dest = dfa->nexts[org_node];
1487 re_node_set_empty (dfa->edests + clone_node);
1488 clone_dest = duplicate_node (dfa, org_dest, constraint);
1489 if (BE (clone_dest == REG_MISSING, 0))
1491 dfa->nexts[clone_node] = dfa->nexts[org_node];
1492 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496 else if (dfa->edests[org_node].nelem == 0)
1498 /* In case of the node can't epsilon-transit, don't duplicate the
1499 destination and store the original destination as the
1500 destination of the node. */
1501 dfa->nexts[clone_node] = dfa->nexts[org_node];
1504 else if (dfa->edests[org_node].nelem == 1)
1506 /* In case of the node can epsilon-transit, and it has only one
1508 org_dest = dfa->edests[org_node].elems[0];
1509 re_node_set_empty (dfa->edests + clone_node);
1510 /* If the node is root_node itself, it means the epsilon closure
1511 has a loop. Then tie it to the destination of the root_node. */
1512 if (org_node == root_node && clone_node != org_node)
1514 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1519 /* In case the node has another constraint, append it. */
1520 constraint |= dfa->nodes[org_node].constraint;
1521 clone_dest = duplicate_node (dfa, org_dest, constraint);
1522 if (BE (clone_dest == REG_MISSING, 0))
1524 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528 else /* dfa->edests[org_node].nelem == 2 */
1530 /* In case of the node can epsilon-transit, and it has two
1531 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1532 org_dest = dfa->edests[org_node].elems[0];
1533 re_node_set_empty (dfa->edests + clone_node);
1534 /* Search for a duplicated node which satisfies the constraint. */
1535 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1536 if (clone_dest == REG_MISSING)
1538 /* There is no such duplicated node, create a new one. */
1540 clone_dest = duplicate_node (dfa, org_dest, constraint);
1541 if (BE (clone_dest == REG_MISSING, 0))
1543 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1546 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547 root_node, constraint);
1548 if (BE (err != REG_NOERROR, 0))
1553 /* There is a duplicated node which satisfies the constraint,
1554 use it to avoid infinite loop. */
1555 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 org_dest = dfa->edests[org_node].elems[1];
1561 clone_dest = duplicate_node (dfa, org_dest, constraint);
1562 if (BE (clone_dest == REG_MISSING, 0))
1564 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568 org_node = org_dest;
1569 clone_node = clone_dest;
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575 satisfies the constraint CONSTRAINT. */
1578 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1579 unsigned int constraint)
1582 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1584 if (org_node == dfa->org_indices[idx]
1585 && constraint == dfa->nodes[idx].constraint)
1586 return idx; /* Found. */
1588 return REG_MISSING; /* Not found. */
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592 Return the index of the new node, or REG_MISSING if insufficient storage is
1596 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1598 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1599 if (BE (dup_idx != REG_MISSING, 1))
1601 dfa->nodes[dup_idx].constraint = constraint;
1602 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1603 dfa->nodes[dup_idx].duplicated = 1;
1605 /* Store the index of the original node. */
1606 dfa->org_indices[dup_idx] = org_idx;
1611 static reg_errcode_t
1612 calc_inveclosure (re_dfa_t *dfa)
1616 for (idx = 0; idx < dfa->nodes_len; ++idx)
1617 re_node_set_init_empty (dfa->inveclosures + idx);
1619 for (src = 0; src < dfa->nodes_len; ++src)
1621 Idx *elems = dfa->eclosures[src].elems;
1622 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1624 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1633 /* Calculate "eclosure" for all the node in DFA. */
1635 static reg_errcode_t
1636 calc_eclosure (re_dfa_t *dfa)
1641 assert (dfa->nodes_len > 0);
1644 /* For each nodes, calculate epsilon closure. */
1645 for (node_idx = 0; ; ++node_idx)
1648 re_node_set eclosure_elem;
1649 if (node_idx == dfa->nodes_len)
1658 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1661 /* If we have already calculated, skip it. */
1662 if (dfa->eclosures[node_idx].nelem != 0)
1664 /* Calculate epsilon closure of `node_idx'. */
1665 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1666 if (BE (err != REG_NOERROR, 0))
1669 if (dfa->eclosures[node_idx].nelem == 0)
1672 re_node_set_free (&eclosure_elem);
1678 /* Calculate epsilon closure of NODE. */
1680 static reg_errcode_t
1681 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1685 re_node_set eclosure;
1687 bool incomplete = false;
1688 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1689 if (BE (err != REG_NOERROR, 0))
1692 /* This indicates that we are calculating this node now.
1693 We reference this value to avoid infinite loop. */
1694 dfa->eclosures[node].nelem = REG_MISSING;
1696 /* If the current node has constraints, duplicate all nodes
1697 since they must inherit the constraints. */
1698 if (dfa->nodes[node].constraint
1699 && dfa->edests[node].nelem
1700 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1702 err = duplicate_node_closure (dfa, node, node, node,
1703 dfa->nodes[node].constraint);
1704 if (BE (err != REG_NOERROR, 0))
1708 /* Expand each epsilon destination nodes. */
1709 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1710 for (i = 0; i < dfa->edests[node].nelem; ++i)
1712 re_node_set eclosure_elem;
1713 Idx edest = dfa->edests[node].elems[i];
1714 /* If calculating the epsilon closure of `edest' is in progress,
1715 return intermediate result. */
1716 if (dfa->eclosures[edest].nelem == REG_MISSING)
1721 /* If we haven't calculated the epsilon closure of `edest' yet,
1722 calculate now. Otherwise use calculated epsilon closure. */
1723 if (dfa->eclosures[edest].nelem == 0)
1725 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1726 if (BE (err != REG_NOERROR, 0))
1730 eclosure_elem = dfa->eclosures[edest];
1731 /* Merge the epsilon closure of `edest'. */
1732 err = re_node_set_merge (&eclosure, &eclosure_elem);
1733 if (BE (err != REG_NOERROR, 0))
1735 /* If the epsilon closure of `edest' is incomplete,
1736 the epsilon closure of this node is also incomplete. */
1737 if (dfa->eclosures[edest].nelem == 0)
1740 re_node_set_free (&eclosure_elem);
1744 /* An epsilon closure includes itself. */
1745 ok = re_node_set_insert (&eclosure, node);
1748 if (incomplete && !root)
1749 dfa->eclosures[node].nelem = 0;
1751 dfa->eclosures[node] = eclosure;
1752 *new_set = eclosure;
1756 /* Functions for token which are used in the parser. */
1758 /* Fetch a token from INPUT.
1759 We must not use this function inside bracket expressions. */
1763 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1765 re_string_skip_bytes (input, peek_token (result, input, syntax));
1768 /* Peek a token from INPUT, and return the length of the token.
1769 We must not use this function inside bracket expressions. */
1773 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1777 if (re_string_eoi (input))
1779 token->type = END_OF_RE;
1783 c = re_string_peek_byte (input, 0);
1786 token->word_char = 0;
1787 #ifdef RE_ENABLE_I18N
1788 token->mb_partial = 0;
1789 if (input->mb_cur_max > 1 &&
1790 !re_string_first_byte (input, re_string_cur_idx (input)))
1792 token->type = CHARACTER;
1793 token->mb_partial = 1;
1800 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1802 token->type = BACK_SLASH;
1806 c2 = re_string_peek_byte_case (input, 1);
1808 token->type = CHARACTER;
1809 #ifdef RE_ENABLE_I18N
1810 if (input->mb_cur_max > 1)
1812 wint_t wc = re_string_wchar_at (input,
1813 re_string_cur_idx (input) + 1);
1814 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1818 token->word_char = IS_WORD_CHAR (c2) != 0;
1823 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1824 token->type = OP_ALT;
1826 case '1': case '2': case '3': case '4': case '5':
1827 case '6': case '7': case '8': case '9':
1828 if (!(syntax & RE_NO_BK_REFS))
1830 token->type = OP_BACK_REF;
1831 token->opr.idx = c2 - '1';
1835 if (!(syntax & RE_NO_GNU_OPS))
1837 token->type = ANCHOR;
1838 token->opr.ctx_type = WORD_FIRST;
1842 if (!(syntax & RE_NO_GNU_OPS))
1844 token->type = ANCHOR;
1845 token->opr.ctx_type = WORD_LAST;
1849 if (!(syntax & RE_NO_GNU_OPS))
1851 token->type = ANCHOR;
1852 token->opr.ctx_type = WORD_DELIM;
1856 if (!(syntax & RE_NO_GNU_OPS))
1858 token->type = ANCHOR;
1859 token->opr.ctx_type = NOT_WORD_DELIM;
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = OP_WORD;
1867 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = OP_NOTWORD;
1871 if (!(syntax & RE_NO_GNU_OPS))
1872 token->type = OP_SPACE;
1875 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = OP_NOTSPACE;
1879 if (!(syntax & RE_NO_GNU_OPS))
1881 token->type = ANCHOR;
1882 token->opr.ctx_type = BUF_FIRST;
1886 if (!(syntax & RE_NO_GNU_OPS))
1888 token->type = ANCHOR;
1889 token->opr.ctx_type = BUF_LAST;
1893 if (!(syntax & RE_NO_BK_PARENS))
1894 token->type = OP_OPEN_SUBEXP;
1897 if (!(syntax & RE_NO_BK_PARENS))
1898 token->type = OP_CLOSE_SUBEXP;
1901 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902 token->type = OP_DUP_PLUS;
1905 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1906 token->type = OP_DUP_QUESTION;
1909 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910 token->type = OP_OPEN_DUP_NUM;
1913 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1914 token->type = OP_CLOSE_DUP_NUM;
1922 token->type = CHARACTER;
1923 #ifdef RE_ENABLE_I18N
1924 if (input->mb_cur_max > 1)
1926 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1927 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1931 token->word_char = IS_WORD_CHAR (token->opr.c);
1936 if (syntax & RE_NEWLINE_ALT)
1937 token->type = OP_ALT;
1940 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1941 token->type = OP_ALT;
1944 token->type = OP_DUP_ASTERISK;
1947 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948 token->type = OP_DUP_PLUS;
1951 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1952 token->type = OP_DUP_QUESTION;
1955 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956 token->type = OP_OPEN_DUP_NUM;
1959 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1960 token->type = OP_CLOSE_DUP_NUM;
1963 if (syntax & RE_NO_BK_PARENS)
1964 token->type = OP_OPEN_SUBEXP;
1967 if (syntax & RE_NO_BK_PARENS)
1968 token->type = OP_CLOSE_SUBEXP;
1971 token->type = OP_OPEN_BRACKET;
1974 token->type = OP_PERIOD;
1977 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1978 re_string_cur_idx (input) != 0)
1980 char prev = re_string_peek_byte (input, -1);
1981 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1984 token->type = ANCHOR;
1985 token->opr.ctx_type = LINE_FIRST;
1988 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1989 re_string_cur_idx (input) + 1 != re_string_length (input))
1992 re_string_skip_bytes (input, 1);
1993 peek_token (&next, input, syntax);
1994 re_string_skip_bytes (input, -1);
1995 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1998 token->type = ANCHOR;
1999 token->opr.ctx_type = LINE_LAST;
2007 /* Peek a token from INPUT, and return the length of the token.
2008 We must not use this function out of bracket expressions. */
2012 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2015 if (re_string_eoi (input))
2017 token->type = END_OF_RE;
2020 c = re_string_peek_byte (input, 0);
2023 #ifdef RE_ENABLE_I18N
2024 if (input->mb_cur_max > 1 &&
2025 !re_string_first_byte (input, re_string_cur_idx (input)))
2027 token->type = CHARACTER;
2030 #endif /* RE_ENABLE_I18N */
2032 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2033 && re_string_cur_idx (input) + 1 < re_string_length (input))
2035 /* In this case, '\' escape a character. */
2037 re_string_skip_bytes (input, 1);
2038 c2 = re_string_peek_byte (input, 0);
2040 token->type = CHARACTER;
2043 if (c == '[') /* '[' is a special char in a bracket exps. */
2047 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2048 c2 = re_string_peek_byte (input, 1);
2056 token->type = OP_OPEN_COLL_ELEM;
2059 token->type = OP_OPEN_EQUIV_CLASS;
2062 if (syntax & RE_CHAR_CLASSES)
2064 token->type = OP_OPEN_CHAR_CLASS;
2067 /* else fall through. */
2069 token->type = CHARACTER;
2079 token->type = OP_CHARSET_RANGE;
2082 token->type = OP_CLOSE_BRACKET;
2085 token->type = OP_NON_MATCH_LIST;
2088 token->type = CHARACTER;
2093 /* Functions for parser. */
2095 /* Entry point of the parser.
2096 Parse the regular expression REGEXP and return the structure tree.
2097 If an error is occured, ERR is set by error code, and return NULL.
2098 This function build the following tree, from regular expression <reg_exp>:
2104 CAT means concatenation.
2105 EOR means end of regular expression. */
2108 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2111 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2112 bin_tree_t *tree, *eor, *root;
2113 re_token_t current_token;
2114 dfa->syntax = syntax;
2115 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2116 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2117 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2121 root = create_tree (dfa, tree, eor, CONCAT);
2124 if (BE (eor == NULL || root == NULL, 0))
2132 /* This function build the following tree, from regular expression
2133 <branch1>|<branch2>:
2139 ALT means alternative, which represents the operator `|'. */
2142 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2143 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2145 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2146 bin_tree_t *tree, *branch = NULL;
2147 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2148 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2151 while (token->type == OP_ALT)
2153 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154 if (token->type != OP_ALT && token->type != END_OF_RE
2155 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2157 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2158 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2163 tree = create_tree (dfa, tree, branch, OP_ALT);
2164 if (BE (tree == NULL, 0))
2173 /* This function build the following tree, from regular expression
2180 CAT means concatenation. */
2183 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2184 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2186 bin_tree_t *tree, *expr;
2187 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2188 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2189 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2192 while (token->type != OP_ALT && token->type != END_OF_RE
2193 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2195 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2196 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2200 if (tree != NULL && expr != NULL)
2202 tree = create_tree (dfa, tree, expr, CONCAT);
2209 else if (tree == NULL)
2211 /* Otherwise expr == NULL, we don't need to create new tree. */
2216 /* This function build the following tree, from regular expression a*:
2223 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2224 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2226 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228 switch (token->type)
2231 tree = create_token_tree (dfa, NULL, NULL, token);
2232 if (BE (tree == NULL, 0))
2237 #ifdef RE_ENABLE_I18N
2238 if (dfa->mb_cur_max > 1)
2240 while (!re_string_eoi (regexp)
2241 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243 bin_tree_t *mbc_remain;
2244 fetch_token (token, regexp, syntax);
2245 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2246 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2247 if (BE (mbc_remain == NULL || tree == NULL, 0))
2256 case OP_OPEN_SUBEXP:
2257 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2258 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2261 case OP_OPEN_BRACKET:
2262 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2263 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2272 dfa->used_bkref_map |= 1 << token->opr.idx;
2273 tree = create_token_tree (dfa, NULL, NULL, token);
2274 if (BE (tree == NULL, 0))
2280 dfa->has_mb_node = 1;
2282 case OP_OPEN_DUP_NUM:
2283 if (syntax & RE_CONTEXT_INVALID_DUP)
2289 case OP_DUP_ASTERISK:
2291 case OP_DUP_QUESTION:
2292 if (syntax & RE_CONTEXT_INVALID_OPS)
2297 else if (syntax & RE_CONTEXT_INDEP_OPS)
2299 fetch_token (token, regexp, syntax);
2300 return parse_expression (regexp, preg, token, syntax, nest, err);
2302 /* else fall through */
2303 case OP_CLOSE_SUBEXP:
2304 if ((token->type == OP_CLOSE_SUBEXP) &&
2305 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2310 /* else fall through */
2311 case OP_CLOSE_DUP_NUM:
2312 /* We treat it as a normal character. */
2314 /* Then we can these characters as normal characters. */
2315 token->type = CHARACTER;
2316 /* mb_partial and word_char bits should be initialized already
2318 tree = create_token_tree (dfa, NULL, NULL, token);
2319 if (BE (tree == NULL, 0))
2326 if ((token->opr.ctx_type
2327 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2328 && dfa->word_ops_used == 0)
2329 init_word_char (dfa);
2330 if (token->opr.ctx_type == WORD_DELIM
2331 || token->opr.ctx_type == NOT_WORD_DELIM)
2333 bin_tree_t *tree_first, *tree_last;
2334 if (token->opr.ctx_type == WORD_DELIM)
2336 token->opr.ctx_type = WORD_FIRST;
2337 tree_first = create_token_tree (dfa, NULL, NULL, token);
2338 token->opr.ctx_type = WORD_LAST;
2342 token->opr.ctx_type = INSIDE_WORD;
2343 tree_first = create_token_tree (dfa, NULL, NULL, token);
2344 token->opr.ctx_type = INSIDE_NOTWORD;
2346 tree_last = create_token_tree (dfa, NULL, NULL, token);
2347 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2348 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2356 tree = create_token_tree (dfa, NULL, NULL, token);
2357 if (BE (tree == NULL, 0))
2363 /* We must return here, since ANCHORs can't be followed
2364 by repetition operators.
2365 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2366 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2367 fetch_token (token, regexp, syntax);
2370 tree = create_token_tree (dfa, NULL, NULL, token);
2371 if (BE (tree == NULL, 0))
2376 if (dfa->mb_cur_max > 1)
2377 dfa->has_mb_node = 1;
2381 tree = build_charclass_op (dfa, regexp->trans,
2382 (const unsigned char *) "alnum",
2383 (const unsigned char *) "_",
2384 token->type == OP_NOTWORD, err);
2385 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2390 tree = build_charclass_op (dfa, regexp->trans,
2391 (const unsigned char *) "space",
2392 (const unsigned char *) "",
2393 token->type == OP_NOTSPACE, err);
2394 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2404 /* Must not happen? */
2410 fetch_token (token, regexp, syntax);
2412 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2413 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2416 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418 /* In BRE consecutive duplications are not allowed. */
2419 if ((syntax & RE_CONTEXT_INVALID_DUP)
2420 && (token->type == OP_DUP_ASTERISK
2421 || token->type == OP_OPEN_DUP_NUM))
2431 /* This function build the following tree, from regular expression
2439 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2440 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2442 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2445 cur_nsub = preg->re_nsub++;
2447 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449 /* The subexpression may be a null string. */
2450 if (token->type == OP_CLOSE_SUBEXP)
2454 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2455 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457 if (BE (*err != REG_NOERROR, 0))
2461 if (cur_nsub <= '9' - '1')
2462 dfa->completed_bkref_map |= 1 << cur_nsub;
2464 tree = create_tree (dfa, tree, NULL, SUBEXP);
2465 if (BE (tree == NULL, 0))
2470 tree->token.opr.idx = cur_nsub;
2474 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2477 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2478 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2480 bin_tree_t *tree = NULL, *old_tree = NULL;
2481 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2482 re_token_t start_token = *token;
2484 if (token->type == OP_OPEN_DUP_NUM)
2487 start = fetch_number (regexp, token, syntax);
2488 if (start == REG_MISSING)
2490 if (token->type == CHARACTER && token->opr.c == ',')
2491 start = 0; /* We treat "{,m}" as "{0,m}". */
2494 *err = REG_BADBR; /* <re>{} is invalid. */
2498 if (BE (start != REG_ERROR, 1))
2500 /* We treat "{n}" as "{n,n}". */
2501 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2502 : ((token->type == CHARACTER && token->opr.c == ',')
2503 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2505 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2507 /* Invalid sequence. */
2508 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2510 if (token->type == END_OF_RE)
2518 /* If the syntax bit is set, rollback. */
2519 re_string_set_index (regexp, start_idx);
2520 *token = start_token;
2521 token->type = CHARACTER;
2522 /* mb_partial and word_char bits should be already initialized by
2527 if (BE ((end != REG_MISSING && start > end)
2528 || token->type != OP_CLOSE_DUP_NUM, 0))
2530 /* First number greater than second. */
2537 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2538 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2541 fetch_token (token, regexp, syntax);
2543 if (BE (elem == NULL, 0))
2545 if (BE (start == 0 && end == 0, 0))
2547 postorder (elem, free_tree, NULL);
2551 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2552 if (BE (start > 0, 0))
2555 for (i = 2; i <= start; ++i)
2557 elem = duplicate_tree (elem, dfa);
2558 tree = create_tree (dfa, tree, elem, CONCAT);
2559 if (BE (elem == NULL || tree == NULL, 0))
2560 goto parse_dup_op_espace;
2566 /* Duplicate ELEM before it is marked optional. */
2567 elem = duplicate_tree (elem, dfa);
2573 if (elem->token.type == SUBEXP)
2574 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2576 tree = create_tree (dfa, elem, NULL,
2577 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2578 if (BE (tree == NULL, 0))
2579 goto parse_dup_op_espace;
2581 /* From gnulib's "intprops.h":
2582 True if the arithmetic type T is signed. */
2583 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2585 /* This loop is actually executed only when end != REG_MISSING,
2586 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2587 already created the start+1-th copy. */
2588 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2589 for (i = start + 2; i <= end; ++i)
2591 elem = duplicate_tree (elem, dfa);
2592 tree = create_tree (dfa, tree, elem, CONCAT);
2593 if (BE (elem == NULL || tree == NULL, 0))
2594 goto parse_dup_op_espace;
2596 tree = create_tree (dfa, tree, NULL, OP_ALT);
2597 if (BE (tree == NULL, 0))
2598 goto parse_dup_op_espace;
2602 tree = create_tree (dfa, old_tree, tree, CONCAT);
2606 parse_dup_op_espace:
2611 /* Size of the names for collating symbol/equivalence_class/character_class.
2612 I'm not sure, but maybe enough. */
2613 #define BRACKET_NAME_BUF_SIZE 32
2616 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2617 Build the range expression which starts from START_ELEM, and ends
2618 at END_ELEM. The result are written to MBCSET and SBCSET.
2619 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2620 mbcset->range_ends, is a pointer argument sinse we may
2623 static reg_errcode_t
2625 # ifdef RE_ENABLE_I18N
2626 build_range_exp (const reg_syntax_t syntax,
2628 re_charset_t *mbcset,
2630 const bracket_elem_t *start_elem,
2631 const bracket_elem_t *end_elem)
2632 # else /* not RE_ENABLE_I18N */
2633 build_range_exp (const reg_syntax_t syntax,
2635 const bracket_elem_t *start_elem,
2636 const bracket_elem_t *end_elem)
2637 # endif /* not RE_ENABLE_I18N */
2639 unsigned int start_ch, end_ch;
2640 /* Equivalence Classes and Character Classes can't be a range start/end. */
2641 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2642 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2646 /* We can handle no multi character collating elements without libc
2648 if (BE ((start_elem->type == COLL_SYM
2649 && strlen ((char *) start_elem->opr.name) > 1)
2650 || (end_elem->type == COLL_SYM
2651 && strlen ((char *) end_elem->opr.name) > 1), 0))
2652 return REG_ECOLLATE;
2654 # ifdef RE_ENABLE_I18N
2659 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2661 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2662 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2664 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2665 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2667 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2668 ? __btowc (start_ch) : start_elem->opr.wch);
2669 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2670 ? __btowc (end_ch) : end_elem->opr.wch);
2671 if (start_wc == WEOF || end_wc == WEOF)
2672 return REG_ECOLLATE;
2673 cmp_buf[0] = start_wc;
2674 cmp_buf[4] = end_wc;
2676 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2677 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2680 /* Got valid collation sequence values, add them as a new entry.
2681 However, for !_LIBC we have no collation elements: if the
2682 character set is single byte, the single byte character set
2683 that we build below suffices. parse_bracket_exp passes
2684 no MBCSET if dfa->mb_cur_max == 1. */
2687 /* Check the space of the arrays. */
2688 if (BE (*range_alloc == mbcset->nranges, 0))
2690 /* There is not enough space, need realloc. */
2691 wchar_t *new_array_start, *new_array_end;
2694 /* +1 in case of mbcset->nranges is 0. */
2695 new_nranges = 2 * mbcset->nranges + 1;
2696 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2697 are NULL if *range_alloc == 0. */
2698 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2700 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2703 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2706 mbcset->range_starts = new_array_start;
2707 mbcset->range_ends = new_array_end;
2708 *range_alloc = new_nranges;
2711 mbcset->range_starts[mbcset->nranges] = start_wc;
2712 mbcset->range_ends[mbcset->nranges++] = end_wc;
2715 /* Build the table for single byte characters. */
2716 for (wc = 0; wc < SBC_MAX; ++wc)
2719 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2720 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2721 bitset_set (sbcset, wc);
2724 # else /* not RE_ENABLE_I18N */
2727 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2728 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2730 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2731 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2733 if (start_ch > end_ch)
2735 /* Build the table for single byte characters. */
2736 for (ch = 0; ch < SBC_MAX; ++ch)
2737 if (start_ch <= ch && ch <= end_ch)
2738 bitset_set (sbcset, ch);
2740 # endif /* not RE_ENABLE_I18N */
2743 #endif /* not _LIBC */
2746 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2747 Build the collating element which is represented by NAME.
2748 The result are written to MBCSET and SBCSET.
2749 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2750 pointer argument since we may update it. */
2752 static reg_errcode_t
2754 build_collating_symbol (bitset_t sbcset,
2755 # ifdef RE_ENABLE_I18N
2756 re_charset_t *mbcset, Idx *coll_sym_alloc,
2758 const unsigned char *name)
2760 size_t name_len = strlen ((const char *) name);
2761 if (BE (name_len != 1, 0))
2762 return REG_ECOLLATE;
2765 bitset_set (sbcset, name[0]);
2769 #endif /* not _LIBC */
2771 /* This function parse bracket expression like "[abc]", "[a-c]",
2775 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2776 reg_syntax_t syntax, reg_errcode_t *err)
2779 const unsigned char *collseqmb;
2780 const char *collseqwc;
2783 const int32_t *symb_table;
2784 const unsigned char *extra;
2786 /* Local function for parse_bracket_exp used in _LIBC environement.
2787 Seek the collating symbol entry correspondings to NAME.
2788 Return the index of the symbol in the SYMB_TABLE. */
2791 __attribute ((always_inline))
2792 seek_collating_symbol_entry (name, name_len)
2793 const unsigned char *name;
2796 int32_t hash = elem_hash ((const char *) name, name_len);
2797 int32_t elem = hash % table_size;
2798 if (symb_table[2 * elem] != 0)
2800 int32_t second = hash % (table_size - 2) + 1;
2804 /* First compare the hashing value. */
2805 if (symb_table[2 * elem] == hash
2806 /* Compare the length of the name. */
2807 && name_len == extra[symb_table[2 * elem + 1]]
2808 /* Compare the name. */
2809 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2812 /* Yep, this is the entry. */
2819 while (symb_table[2 * elem] != 0);
2824 /* Local function for parse_bracket_exp used in _LIBC environment.
2825 Look up the collation sequence value of BR_ELEM.
2826 Return the value if succeeded, UINT_MAX otherwise. */
2828 auto inline unsigned int
2829 __attribute ((always_inline))
2830 lookup_collation_sequence_value (br_elem)
2831 bracket_elem_t *br_elem;
2833 if (br_elem->type == SB_CHAR)
2836 if (MB_CUR_MAX == 1)
2839 return collseqmb[br_elem->opr.ch];
2842 wint_t wc = __btowc (br_elem->opr.ch);
2843 return __collseq_table_lookup (collseqwc, wc);
2846 else if (br_elem->type == MB_CHAR)
2849 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2851 else if (br_elem->type == COLL_SYM)
2853 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2857 elem = seek_collating_symbol_entry (br_elem->opr.name,
2859 if (symb_table[2 * elem] != 0)
2861 /* We found the entry. */
2862 idx = symb_table[2 * elem + 1];
2863 /* Skip the name of collating element name. */
2864 idx += 1 + extra[idx];
2865 /* Skip the byte sequence of the collating element. */
2866 idx += 1 + extra[idx];
2867 /* Adjust for the alignment. */
2868 idx = (idx + 3) & ~3;
2869 /* Skip the multibyte collation sequence value. */
2870 idx += sizeof (unsigned int);
2871 /* Skip the wide char sequence of the collating element. */
2872 idx += sizeof (unsigned int) *
2873 (1 + *(unsigned int *) (extra + idx));
2874 /* Return the collation sequence value. */
2875 return *(unsigned int *) (extra + idx);
2877 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2879 /* No valid character. Match it as a single byte
2881 return collseqmb[br_elem->opr.name[0]];
2884 else if (sym_name_len == 1)
2885 return collseqmb[br_elem->opr.name[0]];
2890 /* Local function for parse_bracket_exp used in _LIBC environement.
2891 Build the range expression which starts from START_ELEM, and ends
2892 at END_ELEM. The result are written to MBCSET and SBCSET.
2893 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2894 mbcset->range_ends, is a pointer argument sinse we may
2897 auto inline reg_errcode_t
2898 __attribute ((always_inline))
2899 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2900 re_charset_t *mbcset;
2903 bracket_elem_t *start_elem, *end_elem;
2906 uint32_t start_collseq;
2907 uint32_t end_collseq;
2909 /* Equivalence Classes and Character Classes can't be a range
2911 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2912 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2916 start_collseq = lookup_collation_sequence_value (start_elem);
2917 end_collseq = lookup_collation_sequence_value (end_elem);
2918 /* Check start/end collation sequence values. */
2919 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2920 return REG_ECOLLATE;
2921 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2924 /* Got valid collation sequence values, add them as a new entry.
2925 However, if we have no collation elements, and the character set
2926 is single byte, the single byte character set that we
2927 build below suffices. */
2928 if (nrules > 0 || dfa->mb_cur_max > 1)
2930 /* Check the space of the arrays. */
2931 if (BE (*range_alloc == mbcset->nranges, 0))
2933 /* There is not enough space, need realloc. */
2934 uint32_t *new_array_start;
2935 uint32_t *new_array_end;
2938 /* +1 in case of mbcset->nranges is 0. */
2939 new_nranges = 2 * mbcset->nranges + 1;
2940 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2942 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2945 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2948 mbcset->range_starts = new_array_start;
2949 mbcset->range_ends = new_array_end;
2950 *range_alloc = new_nranges;
2953 mbcset->range_starts[mbcset->nranges] = start_collseq;
2954 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2957 /* Build the table for single byte characters. */
2958 for (ch = 0; ch < SBC_MAX; ch++)
2960 uint32_t ch_collseq;
2962 if (MB_CUR_MAX == 1)
2965 ch_collseq = collseqmb[ch];
2967 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2968 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2969 bitset_set (sbcset, ch);
2974 /* Local function for parse_bracket_exp used in _LIBC environement.
2975 Build the collating element which is represented by NAME.
2976 The result are written to MBCSET and SBCSET.
2977 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2978 pointer argument sinse we may update it. */
2980 auto inline reg_errcode_t
2981 __attribute ((always_inline))
2982 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2983 re_charset_t *mbcset;
2984 Idx *coll_sym_alloc;
2986 const unsigned char *name;
2989 size_t name_len = strlen ((const char *) name);
2992 elem = seek_collating_symbol_entry (name, name_len);
2993 if (symb_table[2 * elem] != 0)
2995 /* We found the entry. */
2996 idx = symb_table[2 * elem + 1];
2997 /* Skip the name of collating element name. */
2998 idx += 1 + extra[idx];
3000 else if (symb_table[2 * elem] == 0 && name_len == 1)
3002 /* No valid character, treat it as a normal
3004 bitset_set (sbcset, name[0]);
3008 return REG_ECOLLATE;
3010 /* Got valid collation sequence, add it as a new entry. */
3011 /* Check the space of the arrays. */
3012 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3014 /* Not enough, realloc it. */
3015 /* +1 in case of mbcset->ncoll_syms is 0. */
3016 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3017 /* Use realloc since mbcset->coll_syms is NULL
3019 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3020 new_coll_sym_alloc);
3021 if (BE (new_coll_syms == NULL, 0))
3023 mbcset->coll_syms = new_coll_syms;
3024 *coll_sym_alloc = new_coll_sym_alloc;
3026 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3031 if (BE (name_len != 1, 0))
3032 return REG_ECOLLATE;
3035 bitset_set (sbcset, name[0]);
3042 re_token_t br_token;
3043 re_bitset_ptr_t sbcset;
3044 #ifdef RE_ENABLE_I18N
3045 re_charset_t *mbcset;
3046 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3047 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3048 #endif /* not RE_ENABLE_I18N */
3049 bool non_match = false;
3050 bin_tree_t *work_tree;
3052 bool first_round = true;
3054 collseqmb = (const unsigned char *)
3055 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3056 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3062 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3063 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3064 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3065 _NL_COLLATE_SYMB_TABLEMB);
3066 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3067 _NL_COLLATE_SYMB_EXTRAMB);
3070 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3071 #ifdef RE_ENABLE_I18N
3072 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3073 #endif /* RE_ENABLE_I18N */
3074 #ifdef RE_ENABLE_I18N
3075 if (BE (sbcset == NULL || mbcset == NULL, 0))
3077 if (BE (sbcset == NULL, 0))
3078 #endif /* RE_ENABLE_I18N */
3084 token_len = peek_token_bracket (token, regexp, syntax);
3085 if (BE (token->type == END_OF_RE, 0))
3088 goto parse_bracket_exp_free_return;
3090 if (token->type == OP_NON_MATCH_LIST)
3092 #ifdef RE_ENABLE_I18N
3093 mbcset->non_match = 1;
3094 #endif /* not RE_ENABLE_I18N */
3096 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3097 bitset_set (sbcset, '\n');
3098 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3099 token_len = peek_token_bracket (token, regexp, syntax);
3100 if (BE (token->type == END_OF_RE, 0))
3103 goto parse_bracket_exp_free_return;
3107 /* We treat the first ']' as a normal character. */
3108 if (token->type == OP_CLOSE_BRACKET)
3109 token->type = CHARACTER;
3113 bracket_elem_t start_elem, end_elem;
3114 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3115 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3118 bool is_range_exp = false;
3121 start_elem.opr.name = start_name_buf;
3122 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3123 syntax, first_round);
3124 if (BE (ret != REG_NOERROR, 0))
3127 goto parse_bracket_exp_free_return;
3129 first_round = false;
3131 /* Get information about the next token. We need it in any case. */
3132 token_len = peek_token_bracket (token, regexp, syntax);
3134 /* Do not check for ranges if we know they are not allowed. */
3135 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3137 if (BE (token->type == END_OF_RE, 0))
3140 goto parse_bracket_exp_free_return;
3142 if (token->type == OP_CHARSET_RANGE)
3144 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3145 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3146 if (BE (token2.type == END_OF_RE, 0))
3149 goto parse_bracket_exp_free_return;
3151 if (token2.type == OP_CLOSE_BRACKET)
3153 /* We treat the last '-' as a normal character. */
3154 re_string_skip_bytes (regexp, -token_len);
3155 token->type = CHARACTER;
3158 is_range_exp = true;
3162 if (is_range_exp == true)
3164 end_elem.opr.name = end_name_buf;
3165 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3167 if (BE (ret != REG_NOERROR, 0))
3170 goto parse_bracket_exp_free_return;
3173 token_len = peek_token_bracket (token, regexp, syntax);
3176 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3177 &start_elem, &end_elem);
3179 # ifdef RE_ENABLE_I18N
3180 *err = build_range_exp (syntax, sbcset,
3181 dfa->mb_cur_max > 1 ? mbcset : NULL,
3182 &range_alloc, &start_elem, &end_elem);
3184 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3186 #endif /* RE_ENABLE_I18N */
3187 if (BE (*err != REG_NOERROR, 0))
3188 goto parse_bracket_exp_free_return;
3192 switch (start_elem.type)
3195 bitset_set (sbcset, start_elem.opr.ch);
3197 #ifdef RE_ENABLE_I18N
3199 /* Check whether the array has enough space. */
3200 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3202 wchar_t *new_mbchars;
3203 /* Not enough, realloc it. */
3204 /* +1 in case of mbcset->nmbchars is 0. */
3205 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3206 /* Use realloc since array is NULL if *alloc == 0. */
3207 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3209 if (BE (new_mbchars == NULL, 0))
3210 goto parse_bracket_exp_espace;
3211 mbcset->mbchars = new_mbchars;
3213 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3215 #endif /* RE_ENABLE_I18N */
3217 *err = build_equiv_class (sbcset,
3218 #ifdef RE_ENABLE_I18N
3219 mbcset, &equiv_class_alloc,
3220 #endif /* RE_ENABLE_I18N */
3221 start_elem.opr.name);
3222 if (BE (*err != REG_NOERROR, 0))
3223 goto parse_bracket_exp_free_return;
3226 *err = build_collating_symbol (sbcset,
3227 #ifdef RE_ENABLE_I18N
3228 mbcset, &coll_sym_alloc,
3229 #endif /* RE_ENABLE_I18N */
3230 start_elem.opr.name);
3231 if (BE (*err != REG_NOERROR, 0))
3232 goto parse_bracket_exp_free_return;
3235 *err = build_charclass (regexp->trans, sbcset,
3236 #ifdef RE_ENABLE_I18N
3237 mbcset, &char_class_alloc,
3238 #endif /* RE_ENABLE_I18N */
3239 start_elem.opr.name, syntax);
3240 if (BE (*err != REG_NOERROR, 0))
3241 goto parse_bracket_exp_free_return;
3248 if (BE (token->type == END_OF_RE, 0))
3251 goto parse_bracket_exp_free_return;
3253 if (token->type == OP_CLOSE_BRACKET)
3257 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3259 /* If it is non-matching list. */
3261 bitset_not (sbcset);
3263 #ifdef RE_ENABLE_I18N
3264 /* Ensure only single byte characters are set. */
3265 if (dfa->mb_cur_max > 1)
3266 bitset_mask (sbcset, dfa->sb_char);
3268 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3269 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3270 || mbcset->non_match)))
3272 bin_tree_t *mbc_tree;
3274 /* Build a tree for complex bracket. */
3275 dfa->has_mb_node = 1;
3276 br_token.type = COMPLEX_BRACKET;
3277 br_token.opr.mbcset = mbcset;
3278 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3279 if (BE (mbc_tree == NULL, 0))
3280 goto parse_bracket_exp_espace;
3281 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3282 if (sbcset[sbc_idx])
3284 /* If there are no bits set in sbcset, there is no point
3285 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3286 if (sbc_idx < BITSET_WORDS)
3288 /* Build a tree for simple bracket. */
3289 br_token.type = SIMPLE_BRACKET;
3290 br_token.opr.sbcset = sbcset;
3291 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3292 if (BE (work_tree == NULL, 0))
3293 goto parse_bracket_exp_espace;
3295 /* Then join them by ALT node. */
3296 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3297 if (BE (work_tree == NULL, 0))
3298 goto parse_bracket_exp_espace;
3303 work_tree = mbc_tree;
3307 #endif /* not RE_ENABLE_I18N */
3309 #ifdef RE_ENABLE_I18N
3310 free_charset (mbcset);
3312 /* Build a tree for simple bracket. */
3313 br_token.type = SIMPLE_BRACKET;
3314 br_token.opr.sbcset = sbcset;
3315 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3316 if (BE (work_tree == NULL, 0))
3317 goto parse_bracket_exp_espace;
3321 parse_bracket_exp_espace:
3323 parse_bracket_exp_free_return:
3325 #ifdef RE_ENABLE_I18N
3326 free_charset (mbcset);
3327 #endif /* RE_ENABLE_I18N */
3331 /* Parse an element in the bracket expression. */
3333 static reg_errcode_t
3334 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3335 re_token_t *token, int token_len, re_dfa_t *dfa,
3336 reg_syntax_t syntax, bool accept_hyphen)
3338 #ifdef RE_ENABLE_I18N
3340 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3341 if (cur_char_size > 1)
3343 elem->type = MB_CHAR;
3344 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3345 re_string_skip_bytes (regexp, cur_char_size);
3348 #endif /* RE_ENABLE_I18N */
3349 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3350 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3351 || token->type == OP_OPEN_EQUIV_CLASS)
3352 return parse_bracket_symbol (elem, regexp, token);
3353 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3355 /* A '-' must only appear as anything but a range indicator before
3356 the closing bracket. Everything else is an error. */
3358 (void) peek_token_bracket (&token2, regexp, syntax);
3359 if (token2.type != OP_CLOSE_BRACKET)
3360 /* The actual error value is not standardized since this whole
3361 case is undefined. But ERANGE makes good sense. */
3364 elem->type = SB_CHAR;
3365 elem->opr.ch = token->opr.c;
3369 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3370 such as [:<character_class>:], [.<collating_element>.], and
3371 [=<equivalent_class>=]. */
3373 static reg_errcode_t
3374 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3377 unsigned char ch, delim = token->opr.c;
3379 if (re_string_eoi(regexp))
3383 if (i >= BRACKET_NAME_BUF_SIZE)
3385 if (token->type == OP_OPEN_CHAR_CLASS)
3386 ch = re_string_fetch_byte_case (regexp);
3388 ch = re_string_fetch_byte (regexp);
3389 if (re_string_eoi(regexp))
3391 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3393 elem->opr.name[i] = ch;
3395 re_string_skip_bytes (regexp, 1);
3396 elem->opr.name[i] = '\0';
3397 switch (token->type)
3399 case OP_OPEN_COLL_ELEM:
3400 elem->type = COLL_SYM;
3402 case OP_OPEN_EQUIV_CLASS:
3403 elem->type = EQUIV_CLASS;
3405 case OP_OPEN_CHAR_CLASS:
3406 elem->type = CHAR_CLASS;
3414 /* Helper function for parse_bracket_exp.
3415 Build the equivalence class which is represented by NAME.
3416 The result are written to MBCSET and SBCSET.
3417 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3418 is a pointer argument sinse we may update it. */
3420 static reg_errcode_t
3421 #ifdef RE_ENABLE_I18N
3422 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3423 Idx *equiv_class_alloc, const unsigned char *name)
3424 #else /* not RE_ENABLE_I18N */
3425 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3426 #endif /* not RE_ENABLE_I18N */
3429 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3432 const int32_t *table, *indirect;
3433 const unsigned char *weights, *extra, *cp;
3434 unsigned char char_buf[2];
3438 /* This #include defines a local function! */
3439 # include <locale/weight.h>
3440 /* Calculate the index for equivalence class. */
3442 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3443 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3444 _NL_COLLATE_WEIGHTMB);
3445 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3446 _NL_COLLATE_EXTRAMB);
3447 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3448 _NL_COLLATE_INDIRECTMB);
3449 idx1 = findidx (&cp);
3450 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3451 /* This isn't a valid character. */
3452 return REG_ECOLLATE;
3454 /* Build single byte matcing table for this equivalence class. */
3455 char_buf[1] = (unsigned char) '\0';
3456 len = weights[idx1 & 0xffffff];
3457 for (ch = 0; ch < SBC_MAX; ++ch)
3461 idx2 = findidx (&cp);
3466 /* This isn't a valid character. */
3468 /* Compare only if the length matches and the collation rule
3469 index is the same. */
3470 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3474 while (cnt <= len &&
3475 weights[(idx1 & 0xffffff) + 1 + cnt]
3476 == weights[(idx2 & 0xffffff) + 1 + cnt])
3480 bitset_set (sbcset, ch);
3483 /* Check whether the array has enough space. */
3484 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3486 /* Not enough, realloc it. */
3487 /* +1 in case of mbcset->nequiv_classes is 0. */
3488 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3489 /* Use realloc since the array is NULL if *alloc == 0. */
3490 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3492 new_equiv_class_alloc);
3493 if (BE (new_equiv_classes == NULL, 0))
3495 mbcset->equiv_classes = new_equiv_classes;
3496 *equiv_class_alloc = new_equiv_class_alloc;
3498 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3503 if (BE (strlen ((const char *) name) != 1, 0))
3504 return REG_ECOLLATE;
3505 bitset_set (sbcset, *name);
3510 /* Helper function for parse_bracket_exp.
3511 Build the character class which is represented by NAME.
3512 The result are written to MBCSET and SBCSET.
3513 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3514 is a pointer argument sinse we may update it. */
3516 static reg_errcode_t
3517 #ifdef RE_ENABLE_I18N
3518 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3519 re_charset_t *mbcset, Idx *char_class_alloc,
3520 const unsigned char *class_name, reg_syntax_t syntax)
3521 #else /* not RE_ENABLE_I18N */
3522 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3523 const unsigned char *class_name, reg_syntax_t syntax)
3524 #endif /* not RE_ENABLE_I18N */
3527 const char *name = (const char *) class_name;
3529 /* In case of REG_ICASE "upper" and "lower" match the both of
3530 upper and lower cases. */
3531 if ((syntax & RE_ICASE)
3532 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3535 #ifdef RE_ENABLE_I18N
3536 /* Check the space of the arrays. */
3537 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3539 /* Not enough, realloc it. */
3540 /* +1 in case of mbcset->nchar_classes is 0. */
3541 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3542 /* Use realloc since array is NULL if *alloc == 0. */
3543 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3544 new_char_class_alloc);
3545 if (BE (new_char_classes == NULL, 0))
3547 mbcset->char_classes = new_char_classes;
3548 *char_class_alloc = new_char_class_alloc;
3550 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3551 #endif /* RE_ENABLE_I18N */
3553 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3555 if (BE (trans != NULL, 0)) \
3557 for (i = 0; i < SBC_MAX; ++i) \
3558 if (ctype_func (i)) \
3559 bitset_set (sbcset, trans[i]); \
3563 for (i = 0; i < SBC_MAX; ++i) \
3564 if (ctype_func (i)) \
3565 bitset_set (sbcset, i); \
3569 if (strcmp (name, "alnum") == 0)
3570 BUILD_CHARCLASS_LOOP (isalnum);
3571 else if (strcmp (name, "cntrl") == 0)
3572 BUILD_CHARCLASS_LOOP (iscntrl);
3573 else if (strcmp (name, "lower") == 0)
3574 BUILD_CHARCLASS_LOOP (islower);
3575 else if (strcmp (name, "space") == 0)
3576 BUILD_CHARCLASS_LOOP (isspace);
3577 else if (strcmp (name, "alpha") == 0)
3578 BUILD_CHARCLASS_LOOP (isalpha);
3579 else if (strcmp (name, "digit") == 0)
3580 BUILD_CHARCLASS_LOOP (isdigit);
3581 else if (strcmp (name, "print") == 0)
3582 BUILD_CHARCLASS_LOOP (isprint);
3583 else if (strcmp (name, "upper") == 0)
3584 BUILD_CHARCLASS_LOOP (isupper);
3585 else if (strcmp (name, "blank") == 0)
3586 BUILD_CHARCLASS_LOOP (isblank);
3587 else if (strcmp (name, "graph") == 0)
3588 BUILD_CHARCLASS_LOOP (isgraph);
3589 else if (strcmp (name, "punct") == 0)
3590 BUILD_CHARCLASS_LOOP (ispunct);
3591 else if (strcmp (name, "xdigit") == 0)
3592 BUILD_CHARCLASS_LOOP (isxdigit);
3600 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3601 const unsigned char *class_name,
3602 const unsigned char *extra, bool non_match,
3605 re_bitset_ptr_t sbcset;
3606 #ifdef RE_ENABLE_I18N
3607 re_charset_t *mbcset;
3609 #endif /* not RE_ENABLE_I18N */
3611 re_token_t br_token;
3614 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3615 #ifdef RE_ENABLE_I18N
3616 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3617 #endif /* RE_ENABLE_I18N */
3619 #ifdef RE_ENABLE_I18N
3620 if (BE (sbcset == NULL || mbcset == NULL, 0))
3621 #else /* not RE_ENABLE_I18N */
3622 if (BE (sbcset == NULL, 0))
3623 #endif /* not RE_ENABLE_I18N */
3631 #ifdef RE_ENABLE_I18N
3632 mbcset->non_match = 1;
3633 #endif /* not RE_ENABLE_I18N */
3636 /* We don't care the syntax in this case. */
3637 ret = build_charclass (trans, sbcset,
3638 #ifdef RE_ENABLE_I18N
3640 #endif /* RE_ENABLE_I18N */
3643 if (BE (ret != REG_NOERROR, 0))
3646 #ifdef RE_ENABLE_I18N
3647 free_charset (mbcset);
3648 #endif /* RE_ENABLE_I18N */
3652 /* \w match '_' also. */
3653 for (; *extra; extra++)
3654 bitset_set (sbcset, *extra);
3656 /* If it is non-matching list. */
3658 bitset_not (sbcset);
3660 #ifdef RE_ENABLE_I18N
3661 /* Ensure only single byte characters are set. */
3662 if (dfa->mb_cur_max > 1)
3663 bitset_mask (sbcset, dfa->sb_char);
3666 /* Build a tree for simple bracket. */
3667 br_token.type = SIMPLE_BRACKET;
3668 br_token.opr.sbcset = sbcset;
3669 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3670 if (BE (tree == NULL, 0))
3671 goto build_word_op_espace;
3673 #ifdef RE_ENABLE_I18N
3674 if (dfa->mb_cur_max > 1)
3676 bin_tree_t *mbc_tree;
3677 /* Build a tree for complex bracket. */
3678 br_token.type = COMPLEX_BRACKET;
3679 br_token.opr.mbcset = mbcset;
3680 dfa->has_mb_node = 1;
3681 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3682 if (BE (mbc_tree == NULL, 0))
3683 goto build_word_op_espace;
3684 /* Then join them by ALT node. */
3685 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3686 if (BE (mbc_tree != NULL, 1))
3691 free_charset (mbcset);
3694 #else /* not RE_ENABLE_I18N */
3696 #endif /* not RE_ENABLE_I18N */
3698 build_word_op_espace:
3700 #ifdef RE_ENABLE_I18N
3701 free_charset (mbcset);
3702 #endif /* RE_ENABLE_I18N */
3707 /* This is intended for the expressions like "a{1,3}".
3708 Fetch a number from `input', and return the number.
3709 Return REG_MISSING if the number field is empty like "{,1}".
3710 Return REG_ERROR if an error occurred. */
3713 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3715 Idx num = REG_MISSING;
3719 fetch_token (token, input, syntax);
3721 if (BE (token->type == END_OF_RE, 0))
3723 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3725 num = ((token->type != CHARACTER || c < '0' || '9' < c
3726 || num == REG_ERROR)
3728 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3729 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3734 #ifdef RE_ENABLE_I18N
3736 free_charset (re_charset_t *cset)
3738 re_free (cset->mbchars);
3740 re_free (cset->coll_syms);
3741 re_free (cset->equiv_classes);
3742 re_free (cset->range_starts);
3743 re_free (cset->range_ends);
3745 re_free (cset->char_classes);
3748 #endif /* RE_ENABLE_I18N */
3750 /* Functions for binary tree operation. */
3752 /* Create a tree node. */
3755 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3756 re_token_type_t type)
3760 return create_token_tree (dfa, left, right, &t);
3764 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3765 const re_token_t *token)
3768 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3770 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3772 if (storage == NULL)
3774 storage->next = dfa->str_tree_storage;
3775 dfa->str_tree_storage = storage;
3776 dfa->str_tree_storage_idx = 0;
3778 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3780 tree->parent = NULL;
3782 tree->right = right;
3783 tree->token = *token;
3784 tree->token.duplicated = 0;
3785 tree->token.opt_subexp = 0;
3788 tree->node_idx = REG_MISSING;
3791 left->parent = tree;
3793 right->parent = tree;
3797 /* Mark the tree SRC as an optional subexpression.
3798 To be called from preorder or postorder. */
3800 static reg_errcode_t
3801 mark_opt_subexp (void *extra, bin_tree_t *node)
3803 Idx idx = (Idx) (long) extra;
3804 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3805 node->token.opt_subexp = 1;
3810 /* Free the allocated memory inside NODE. */
3813 free_token (re_token_t *node)
3815 #ifdef RE_ENABLE_I18N
3816 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3817 free_charset (node->opr.mbcset);
3819 #endif /* RE_ENABLE_I18N */
3820 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3821 re_free (node->opr.sbcset);
3824 /* Worker function for tree walking. Free the allocated memory inside NODE
3825 and its children. */
3827 static reg_errcode_t
3828 free_tree (void *extra, bin_tree_t *node)
3830 free_token (&node->token);
3835 /* Duplicate the node SRC, and return new node. This is a preorder
3836 visit similar to the one implemented by the generic visitor, but
3837 we need more infrastructure to maintain two parallel trees --- so,
3838 it's easier to duplicate. */
3841 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3843 const bin_tree_t *node;
3844 bin_tree_t *dup_root;
3845 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3847 for (node = root; ; )
3849 /* Create a new tree and link it back to the current parent. */
3850 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3853 (*p_new)->parent = dup_node;
3854 (*p_new)->token.duplicated = 1;
3857 /* Go to the left node, or up and to the right. */
3861 p_new = &dup_node->left;
3865 const bin_tree_t *prev = NULL;
3866 while (node->right == prev || node->right == NULL)
3869 node = node->parent;
3870 dup_node = dup_node->parent;
3875 p_new = &dup_node->right;