1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4 Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
5 This file is part of the GNU C Library.
6 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License along
19 with this program; if not, write to the Free Software Foundation,
20 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
23 size_t length, reg_syntax_t syntax);
24 static void re_compile_fastmap_iter (regex_t *bufp,
25 const re_dfastate_t *init_state,
27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34 static void optimize_utf8 (re_dfa_t *dfa);
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t preorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
40 static reg_errcode_t postorder (bin_tree_t *root,
41 reg_errcode_t (fn (void *, bin_tree_t *)),
43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
51 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
52 unsigned int constraint);
53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
57 static Idx fetch_number (re_string_t *input, re_token_t *token,
59 static int peek_token (re_token_t *token, re_string_t *input,
60 reg_syntax_t syntax) internal_function;
61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
62 reg_syntax_t syntax, reg_errcode_t *err);
63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 Idx nest, reg_errcode_t *err);
66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 Idx nest, reg_errcode_t *err);
69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 Idx nest, reg_errcode_t *err);
72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 Idx nest, reg_errcode_t *err);
75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
76 re_dfa_t *dfa, re_token_t *token,
77 reg_syntax_t syntax, reg_errcode_t *err);
78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
79 re_token_t *token, reg_syntax_t syntax,
81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
83 re_token_t *token, int token_len,
87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
93 Idx *equiv_class_alloc,
94 const unsigned char *name);
95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 Idx *char_class_alloc,
99 const unsigned char *class_name,
100 reg_syntax_t syntax);
101 #else /* not RE_ENABLE_I18N */
102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
103 const unsigned char *name);
104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
106 const unsigned char *class_name,
107 reg_syntax_t syntax);
108 #endif /* not RE_ENABLE_I18N */
109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
110 RE_TRANSLATE_TYPE trans,
111 const unsigned char *class_name,
112 const unsigned char *extra,
113 bool non_match, reg_errcode_t *err);
114 static bin_tree_t *create_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 re_token_type_t type);
117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
118 bin_tree_t *left, bin_tree_t *right,
119 const re_token_t *token);
120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
121 static void free_token (re_token_t *node);
122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125 /* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
130 static const char __re_error_msgid[] =
132 #define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
138 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
168 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
171 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
177 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 static const size_t __re_error_msgid_idx[] =
205 /* Entry points for GNU code. */
207 /* re_compile_pattern is the GNU regular expression compiler: it
208 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209 Returns 0 if the pattern was valid, otherwise an error string.
211 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212 are set in BUFP on entry. */
216 re_compile_pattern (pattern, length, bufp)
219 struct re_pattern_buffer *bufp;
220 #else /* size_t might promote */
222 re_compile_pattern (const char *pattern, size_t length,
223 struct re_pattern_buffer *bufp)
228 /* And GNU code determines whether or not to get register information
229 by passing null for the REGS argument to re_match, etc., not by
230 setting no_sub, unless RE_NO_SUB is set. */
231 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
233 /* Match anchors at newline. */
234 bufp->newline_anchor = 1;
236 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
240 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
243 weak_alias (__re_compile_pattern, re_compile_pattern)
246 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
247 also be assigned to arbitrarily: each pattern buffer stores its own
248 syntax, so it can be changed between regex compilations. */
249 /* This has no initializer because initialized variables in Emacs
250 become read-only after dumping. */
251 reg_syntax_t re_syntax_options;
254 /* Specify the precise syntax of regexps for compilation. This provides
255 for compatibility for various utilities which historically have
256 different, incompatible syntaxes.
258 The argument SYNTAX is a bit mask comprised of the various bits
259 defined in regex.h. We return the old syntax. */
262 re_set_syntax (syntax)
265 reg_syntax_t ret = re_syntax_options;
267 re_syntax_options = syntax;
271 weak_alias (__re_set_syntax, re_set_syntax)
275 re_compile_fastmap (bufp)
276 struct re_pattern_buffer *bufp;
278 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
279 char *fastmap = bufp->fastmap;
281 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
282 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
283 if (dfa->init_state != dfa->init_state_word)
284 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
285 if (dfa->init_state != dfa->init_state_nl)
286 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
287 if (dfa->init_state != dfa->init_state_begbuf)
288 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
289 bufp->fastmap_accurate = 1;
293 weak_alias (__re_compile_fastmap, re_compile_fastmap)
297 __attribute ((always_inline))
298 re_set_fastmap (char *fastmap, bool icase, int ch)
302 fastmap[tolower (ch)] = 1;
305 /* Helper function for re_compile_fastmap.
306 Compile fastmap for the initial_state INIT_STATE. */
309 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
312 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
314 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
315 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
317 Idx node = init_state->nodes.elems[node_cnt];
318 re_token_type_t type = dfa->nodes[node].type;
320 if (type == CHARACTER)
322 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
323 #ifdef RE_ENABLE_I18N
324 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
326 unsigned char buf[MB_LEN_MAX];
332 *p++ = dfa->nodes[node].opr.c;
333 while (++node < dfa->nodes_len
334 && dfa->nodes[node].type == CHARACTER
335 && dfa->nodes[node].mb_partial)
336 *p++ = dfa->nodes[node].opr.c;
337 memset (&state, '\0', sizeof (state));
338 if (mbrtowc (&wc, (const char *) buf, p - buf,
340 && (__wcrtomb ((char *) buf, towlower (wc), &state)
342 re_set_fastmap (fastmap, false, buf[0]);
346 else if (type == SIMPLE_BRACKET)
349 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
352 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
353 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
354 if (w & ((bitset_word_t) 1 << j))
355 re_set_fastmap (fastmap, icase, ch);
358 #ifdef RE_ENABLE_I18N
359 else if (type == COMPLEX_BRACKET)
362 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
363 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
364 || cset->nranges || cset->nchar_classes)
367 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
369 /* In this case we want to catch the bytes which are
370 the first byte of any collation elements.
371 e.g. In da_DK, we want to catch 'a' since "aa"
372 is a valid collation element, and don't catch
373 'b' since 'b' is the only collation element
374 which starts from 'b'. */
375 const int32_t *table = (const int32_t *)
376 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
377 for (i = 0; i < SBC_MAX; ++i)
379 re_set_fastmap (fastmap, icase, i);
382 if (dfa->mb_cur_max > 1)
383 for (i = 0; i < SBC_MAX; ++i)
384 if (__btowc (i) == WEOF)
385 re_set_fastmap (fastmap, icase, i);
386 # endif /* not _LIBC */
388 for (i = 0; i < cset->nmbchars; ++i)
392 memset (&state, '\0', sizeof (state));
393 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
394 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
395 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
397 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
399 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
403 #endif /* RE_ENABLE_I18N */
404 else if (type == OP_PERIOD
405 #ifdef RE_ENABLE_I18N
406 || type == OP_UTF8_PERIOD
407 #endif /* RE_ENABLE_I18N */
408 || type == END_OF_RE)
410 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
411 if (type == END_OF_RE)
412 bufp->can_be_null = 1;
418 /* Entry point for POSIX code. */
419 /* regcomp takes a regular expression as a string and compiles it.
421 PREG is a regex_t *. We do not expect any fields to be initialized,
422 since POSIX says we shouldn't. Thus, we set
424 `buffer' to the compiled pattern;
425 `used' to the length of the compiled pattern;
426 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
427 REG_EXTENDED bit in CFLAGS is set; otherwise, to
428 RE_SYNTAX_POSIX_BASIC;
429 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
430 `fastmap' to an allocated space for the fastmap;
431 `fastmap_accurate' to zero;
432 `re_nsub' to the number of subexpressions in PATTERN.
434 PATTERN is the address of the pattern string.
436 CFLAGS is a series of bits which affect compilation.
438 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
439 use POSIX basic syntax.
441 If REG_NEWLINE is set, then . and [^...] don't match newline.
442 Also, regexec will try a match beginning after every newline.
444 If REG_ICASE is set, then we considers upper- and lowercase
445 versions of letters to be equivalent when matching.
447 If REG_NOSUB is set, then when PREG is passed to regexec, that
448 routine will report only success or failure, and nothing about the
451 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
452 the return codes and their meanings.) */
455 regcomp (preg, pattern, cflags)
456 regex_t *_Restrict_ preg;
457 const char *_Restrict_ pattern;
461 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
462 : RE_SYNTAX_POSIX_BASIC);
468 /* Try to allocate space for the fastmap. */
469 preg->fastmap = re_malloc (char, SBC_MAX);
470 if (BE (preg->fastmap == NULL, 0))
473 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
475 /* If REG_NEWLINE is set, newlines are treated differently. */
476 if (cflags & REG_NEWLINE)
477 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
478 syntax &= ~RE_DOT_NEWLINE;
479 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
480 /* It also changes the matching behavior. */
481 preg->newline_anchor = 1;
484 preg->newline_anchor = 0;
485 preg->no_sub = !!(cflags & REG_NOSUB);
486 preg->translate = NULL;
488 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
490 /* POSIX doesn't distinguish between an unmatched open-group and an
491 unmatched close-group: both are REG_EPAREN. */
492 if (ret == REG_ERPAREN)
495 /* We have already checked preg->fastmap != NULL. */
496 if (BE (ret == REG_NOERROR, 1))
497 /* Compute the fastmap now, since regexec cannot modify the pattern
498 buffer. This function never fails in this implementation. */
499 (void) re_compile_fastmap (preg);
502 /* Some error occurred while compiling the expression. */
503 re_free (preg->fastmap);
504 preg->fastmap = NULL;
510 weak_alias (__regcomp, regcomp)
513 /* Returns a message corresponding to an error code, ERRCODE, returned
514 from either regcomp or regexec. We don't use PREG here. */
518 regerror (errcode, preg, errbuf, errbuf_size)
520 const regex_t *_Restrict_ preg;
521 char *_Restrict_ errbuf;
523 #else /* size_t might promote */
525 regerror (int errcode, const regex_t *_Restrict_ preg,
526 char *_Restrict_ errbuf, size_t errbuf_size)
533 || errcode >= (int) (sizeof (__re_error_msgid_idx)
534 / sizeof (__re_error_msgid_idx[0])), 0))
535 /* Only error codes returned by the rest of the code should be passed
536 to this routine. If we are given anything else, or if other regex
537 code generates an invalid error code, then the program has a bug.
538 Dump core so we can fix it. */
541 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
543 msg_size = strlen (msg) + 1; /* Includes the null. */
545 if (BE (errbuf_size != 0, 1))
547 size_t cpy_size = msg_size;
548 if (BE (msg_size > errbuf_size, 0))
550 cpy_size = errbuf_size - 1;
551 errbuf[cpy_size] = '\0';
553 memcpy (errbuf, msg, cpy_size);
559 weak_alias (__regerror, regerror)
563 #ifdef RE_ENABLE_I18N
564 /* This static array is used for the map to single-byte characters when
565 UTF-8 is used. Otherwise we would allocate memory just to initialize
566 it the same all the time. UTF-8 is the preferred encoding so this is
567 a worthwhile optimization. */
568 static const bitset_t utf8_sb_map =
570 /* Set the first 128 bits. */
571 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
572 # error "bitset_word_t is narrower than 32 bits"
573 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
574 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
575 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
576 BITSET_WORD_MAX, BITSET_WORD_MAX,
577 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
581 >> (SBC_MAX % BITSET_WORD_BITS == 0
583 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
589 free_dfa_content (re_dfa_t *dfa)
594 for (i = 0; i < dfa->nodes_len; ++i)
595 free_token (dfa->nodes + i);
596 re_free (dfa->nexts);
597 for (i = 0; i < dfa->nodes_len; ++i)
599 if (dfa->eclosures != NULL)
600 re_node_set_free (dfa->eclosures + i);
601 if (dfa->inveclosures != NULL)
602 re_node_set_free (dfa->inveclosures + i);
603 if (dfa->edests != NULL)
604 re_node_set_free (dfa->edests + i);
606 re_free (dfa->edests);
607 re_free (dfa->eclosures);
608 re_free (dfa->inveclosures);
609 re_free (dfa->nodes);
611 if (dfa->state_table)
612 for (i = 0; i <= dfa->state_hash_mask; ++i)
614 struct re_state_table_entry *entry = dfa->state_table + i;
615 for (j = 0; j < entry->num; ++j)
617 re_dfastate_t *state = entry->array[j];
620 re_free (entry->array);
622 re_free (dfa->state_table);
623 #ifdef RE_ENABLE_I18N
624 if (dfa->sb_char != utf8_sb_map)
625 re_free (dfa->sb_char);
627 re_free (dfa->subexp_map);
629 re_free (dfa->re_str);
636 /* Free dynamically allocated space used by PREG. */
642 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
643 if (BE (dfa != NULL, 1))
644 free_dfa_content (dfa);
648 re_free (preg->fastmap);
649 preg->fastmap = NULL;
651 re_free (preg->translate);
652 preg->translate = NULL;
655 weak_alias (__regfree, regfree)
658 /* Entry points compatible with 4.2 BSD regex library. We don't define
659 them unless specifically requested. */
661 #if defined _REGEX_RE_COMP || defined _LIBC
663 /* BSD has one and only one pattern buffer. */
664 static struct re_pattern_buffer re_comp_buf;
668 /* Make these definitions weak in libc, so POSIX programs can redefine
669 these names if they don't use our functions, and still use
670 regcomp/regexec above without link errors. */
681 if (!re_comp_buf.buffer)
682 return gettext ("No previous regular expression");
686 if (re_comp_buf.buffer)
688 fastmap = re_comp_buf.fastmap;
689 re_comp_buf.fastmap = NULL;
690 __regfree (&re_comp_buf);
691 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
692 re_comp_buf.fastmap = fastmap;
695 if (re_comp_buf.fastmap == NULL)
697 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
698 if (re_comp_buf.fastmap == NULL)
699 return (char *) gettext (__re_error_msgid
700 + __re_error_msgid_idx[(int) REG_ESPACE]);
703 /* Since `re_exec' always passes NULL for the `regs' argument, we
704 don't need to initialize the pattern buffer fields which affect it. */
706 /* Match anchors at newlines. */
707 re_comp_buf.newline_anchor = 1;
709 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
714 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
715 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
719 libc_freeres_fn (free_mem)
721 __regfree (&re_comp_buf);
725 #endif /* _REGEX_RE_COMP */
727 /* Internal entry point.
728 Compile the regular expression PATTERN, whose length is LENGTH.
729 SYNTAX indicate regular expression's syntax. */
732 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
735 reg_errcode_t err = REG_NOERROR;
739 /* Initialize the pattern buffer. */
740 preg->fastmap_accurate = 0;
741 preg->syntax = syntax;
742 preg->not_bol = preg->not_eol = 0;
745 preg->can_be_null = 0;
746 preg->regs_allocated = REGS_UNALLOCATED;
748 /* Initialize the dfa. */
749 dfa = (re_dfa_t *) preg->buffer;
750 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
752 /* If zero allocated, but buffer is non-null, try to realloc
753 enough space. This loses if buffer's address is bogus, but
754 that is the user's responsibility. If ->buffer is NULL this
755 is a simple allocation. */
756 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
759 preg->allocated = sizeof (re_dfa_t);
760 preg->buffer = (unsigned char *) dfa;
762 preg->used = sizeof (re_dfa_t);
764 err = init_dfa (dfa, length);
765 if (BE (err != REG_NOERROR, 0))
767 free_dfa_content (dfa);
773 /* Note: length+1 will not overflow since it is checked in init_dfa. */
774 dfa->re_str = re_malloc (char, length + 1);
775 strncpy (dfa->re_str, pattern, length + 1);
778 __libc_lock_init (dfa->lock);
780 err = re_string_construct (®exp, pattern, length, preg->translate,
781 syntax & RE_ICASE, dfa);
782 if (BE (err != REG_NOERROR, 0))
784 re_compile_internal_free_return:
785 free_workarea_compile (preg);
786 re_string_destruct (®exp);
787 free_dfa_content (dfa);
793 /* Parse the regular expression, and build a structure tree. */
795 dfa->str_tree = parse (®exp, preg, syntax, &err);
796 if (BE (dfa->str_tree == NULL, 0))
797 goto re_compile_internal_free_return;
799 /* Analyze the tree and create the nfa. */
800 err = analyze (preg);
801 if (BE (err != REG_NOERROR, 0))
802 goto re_compile_internal_free_return;
804 #ifdef RE_ENABLE_I18N
805 /* If possible, do searching in single byte encoding to speed things up. */
806 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
810 /* Then create the initial state of the dfa. */
811 err = create_initial_state (dfa);
813 /* Release work areas. */
814 free_workarea_compile (preg);
815 re_string_destruct (®exp);
817 if (BE (err != REG_NOERROR, 0))
819 free_dfa_content (dfa);
827 /* Initialize DFA. We use the length of the regular expression PAT_LEN
828 as the initial length of some arrays. */
831 init_dfa (re_dfa_t *dfa, size_t pat_len)
833 __re_size_t table_size;
834 #ifdef RE_ENABLE_I18N
835 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
837 size_t max_i18n_object_size = 0;
839 size_t max_object_size =
840 MAX (sizeof (struct re_state_table_entry),
841 MAX (sizeof (re_token_t),
842 MAX (sizeof (re_node_set),
843 MAX (sizeof (regmatch_t),
844 max_i18n_object_size))));
846 memset (dfa, '\0', sizeof (re_dfa_t));
848 /* Force allocation of str_tree_storage the first time. */
849 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
851 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
852 calculation below, and for similar doubling calculations
853 elsewhere. And it's <= rather than <, because some of the
854 doubling calculations add 1 afterwards. */
855 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
858 dfa->nodes_alloc = pat_len + 1;
859 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
861 /* table_size = 2 ^ ceil(log pat_len) */
862 for (table_size = 1; ; table_size <<= 1)
863 if (table_size > pat_len)
866 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867 dfa->state_hash_mask = table_size - 1;
869 dfa->mb_cur_max = MB_CUR_MAX;
871 if (dfa->mb_cur_max == 6
872 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
874 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
877 if (strcmp (locale_charset (), "UTF-8") == 0)
880 /* We check exhaustively in the loop below if this charset is a
881 superset of ASCII. */
882 dfa->map_notascii = 0;
885 #ifdef RE_ENABLE_I18N
886 if (dfa->mb_cur_max > 1)
889 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
894 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
895 if (BE (dfa->sb_char == NULL, 0))
898 /* Set the bits corresponding to single byte chars. */
899 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
900 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
902 wint_t wch = __btowc (ch);
904 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
906 if (isascii (ch) && wch != ch)
907 dfa->map_notascii = 1;
914 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
919 /* Initialize WORD_CHAR table, which indicate which character is
920 "word". In this case "word" means that it is the word construction
921 character used by some operators like "\<", "\>", etc. */
925 init_word_char (re_dfa_t *dfa)
928 dfa->word_ops_used = 1;
929 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
930 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
931 if (isalnum (ch) || ch == '_')
932 dfa->word_char[i] |= (bitset_word_t) 1 << j;
935 /* Free the work area which are only used while compiling. */
938 free_workarea_compile (regex_t *preg)
940 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
941 bin_tree_storage_t *storage, *next;
942 for (storage = dfa->str_tree_storage; storage; storage = next)
944 next = storage->next;
947 dfa->str_tree_storage = NULL;
948 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
949 dfa->str_tree = NULL;
950 re_free (dfa->org_indices);
951 dfa->org_indices = NULL;
954 /* Create initial states for all contexts. */
957 create_initial_state (re_dfa_t *dfa)
961 re_node_set init_nodes;
963 /* Initial states have the epsilon closure of the node which is
964 the first node of the regular expression. */
965 first = dfa->str_tree->first->node_idx;
966 dfa->init_node = first;
967 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
968 if (BE (err != REG_NOERROR, 0))
971 /* The back-references which are in initial states can epsilon transit,
972 since in this case all of the subexpressions can be null.
973 Then we add epsilon closures of the nodes which are the next nodes of
974 the back-references. */
975 if (dfa->nbackref > 0)
976 for (i = 0; i < init_nodes.nelem; ++i)
978 Idx node_idx = init_nodes.elems[i];
979 re_token_type_t type = dfa->nodes[node_idx].type;
982 if (type != OP_BACK_REF)
984 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
986 re_token_t *clexp_node;
987 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
988 if (clexp_node->type == OP_CLOSE_SUBEXP
989 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
992 if (clexp_idx == init_nodes.nelem)
995 if (type == OP_BACK_REF)
997 Idx dest_idx = dfa->edests[node_idx].elems[0];
998 if (!re_node_set_contains (&init_nodes, dest_idx))
1000 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1006 /* It must be the first time to invoke acquire_state. */
1007 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1008 /* We don't check ERR here, since the initial state must not be NULL. */
1009 if (BE (dfa->init_state == NULL, 0))
1011 if (dfa->init_state->has_constraint)
1013 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1015 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1017 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1021 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1022 || dfa->init_state_begbuf == NULL, 0))
1026 dfa->init_state_word = dfa->init_state_nl
1027 = dfa->init_state_begbuf = dfa->init_state;
1029 re_node_set_free (&init_nodes);
1033 #ifdef RE_ENABLE_I18N
1034 /* If it is possible to do searching in single byte encoding instead of UTF-8
1035 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1036 DFA nodes where needed. */
1039 optimize_utf8 (re_dfa_t *dfa)
1043 bool mb_chars = false;
1044 bool has_period = false;
1046 for (node = 0; node < dfa->nodes_len; ++node)
1047 switch (dfa->nodes[node].type)
1050 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1054 switch (dfa->nodes[node].opr.ctx_type)
1062 /* Word anchors etc. cannot be handled. */
1072 case OP_DUP_ASTERISK:
1073 case OP_OPEN_SUBEXP:
1074 case OP_CLOSE_SUBEXP:
1076 case COMPLEX_BRACKET:
1078 case SIMPLE_BRACKET:
1079 /* Just double check. */
1081 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1083 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1084 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1086 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1096 if (mb_chars || has_period)
1097 for (node = 0; node < dfa->nodes_len; ++node)
1099 if (dfa->nodes[node].type == CHARACTER
1100 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1101 dfa->nodes[node].mb_partial = 0;
1102 else if (dfa->nodes[node].type == OP_PERIOD)
1103 dfa->nodes[node].type = OP_UTF8_PERIOD;
1106 /* The search can be in single byte locale. */
1107 dfa->mb_cur_max = 1;
1109 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1113 /* Analyze the structure tree, and calculate "first", "next", "edest",
1114 "eclosure", and "inveclosure". */
1116 static reg_errcode_t
1117 analyze (regex_t *preg)
1119 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1122 /* Allocate arrays. */
1123 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1124 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1125 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1126 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1127 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1128 || dfa->eclosures == NULL, 0))
1131 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1132 if (dfa->subexp_map != NULL)
1135 for (i = 0; i < preg->re_nsub; i++)
1136 dfa->subexp_map[i] = i;
1137 preorder (dfa->str_tree, optimize_subexps, dfa);
1138 for (i = 0; i < preg->re_nsub; i++)
1139 if (dfa->subexp_map[i] != i)
1141 if (i == preg->re_nsub)
1143 free (dfa->subexp_map);
1144 dfa->subexp_map = NULL;
1148 ret = postorder (dfa->str_tree, lower_subexps, preg);
1149 if (BE (ret != REG_NOERROR, 0))
1151 ret = postorder (dfa->str_tree, calc_first, dfa);
1152 if (BE (ret != REG_NOERROR, 0))
1154 preorder (dfa->str_tree, calc_next, dfa);
1155 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1156 if (BE (ret != REG_NOERROR, 0))
1158 ret = calc_eclosure (dfa);
1159 if (BE (ret != REG_NOERROR, 0))
1162 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1163 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1164 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1167 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1168 if (BE (dfa->inveclosures == NULL, 0))
1170 ret = calc_inveclosure (dfa);
1176 /* Our parse trees are very unbalanced, so we cannot use a stack to
1177 implement parse tree visits. Instead, we use parent pointers and
1178 some hairy code in these two functions. */
1179 static reg_errcode_t
1180 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1183 bin_tree_t *node, *prev;
1185 for (node = root; ; )
1187 /* Descend down the tree, preferably to the left (or to the right
1188 if that's the only child). */
1189 while (node->left || node->right)
1197 reg_errcode_t err = fn (extra, node);
1198 if (BE (err != REG_NOERROR, 0))
1200 if (node->parent == NULL)
1203 node = node->parent;
1205 /* Go up while we have a node that is reached from the right. */
1206 while (node->right == prev || node->right == NULL);
1211 static reg_errcode_t
1212 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1217 for (node = root; ; )
1219 reg_errcode_t err = fn (extra, node);
1220 if (BE (err != REG_NOERROR, 0))
1223 /* Go to the left node, or up and to the right. */
1228 bin_tree_t *prev = NULL;
1229 while (node->right == prev || node->right == NULL)
1232 node = node->parent;
1241 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1242 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1243 backreferences as well. Requires a preorder visit. */
1244 static reg_errcode_t
1245 optimize_subexps (void *extra, bin_tree_t *node)
1247 re_dfa_t *dfa = (re_dfa_t *) extra;
1249 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1251 int idx = node->token.opr.idx;
1252 node->token.opr.idx = dfa->subexp_map[idx];
1253 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1256 else if (node->token.type == SUBEXP
1257 && node->left && node->left->token.type == SUBEXP)
1259 Idx other_idx = node->left->token.opr.idx;
1261 node->left = node->left->left;
1263 node->left->parent = node;
1265 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1266 if (other_idx < BITSET_WORD_BITS)
1267 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1273 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1274 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1275 static reg_errcode_t
1276 lower_subexps (void *extra, bin_tree_t *node)
1278 regex_t *preg = (regex_t *) extra;
1279 reg_errcode_t err = REG_NOERROR;
1281 if (node->left && node->left->token.type == SUBEXP)
1283 node->left = lower_subexp (&err, preg, node->left);
1285 node->left->parent = node;
1287 if (node->right && node->right->token.type == SUBEXP)
1289 node->right = lower_subexp (&err, preg, node->right);
1291 node->right->parent = node;
1298 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1300 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1301 bin_tree_t *body = node->left;
1302 bin_tree_t *op, *cls, *tree1, *tree;
1305 /* We do not optimize empty subexpressions, because otherwise we may
1306 have bad CONCAT nodes with NULL children. This is obviously not
1307 very common, so we do not lose much. An example that triggers
1308 this case is the sed "script" /\(\)/x. */
1309 && node->left != NULL
1310 && (node->token.opr.idx >= BITSET_WORD_BITS
1311 || !(dfa->used_bkref_map
1312 & ((bitset_word_t) 1 << node->token.opr.idx))))
1315 /* Convert the SUBEXP node to the concatenation of an
1316 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1317 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1318 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1319 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1320 tree = create_tree (dfa, op, tree1, CONCAT);
1321 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1327 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1328 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1332 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1333 nodes. Requires a postorder visit. */
1334 static reg_errcode_t
1335 calc_first (void *extra, bin_tree_t *node)
1337 re_dfa_t *dfa = (re_dfa_t *) extra;
1338 if (node->token.type == CONCAT)
1340 node->first = node->left->first;
1341 node->node_idx = node->left->node_idx;
1346 node->node_idx = re_dfa_add_node (dfa, node->token);
1347 if (BE (node->node_idx == REG_MISSING, 0))
1353 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1354 static reg_errcode_t
1355 calc_next (void *extra, bin_tree_t *node)
1357 switch (node->token.type)
1359 case OP_DUP_ASTERISK:
1360 node->left->next = node;
1363 node->left->next = node->right->first;
1364 node->right->next = node->next;
1368 node->left->next = node->next;
1370 node->right->next = node->next;
1376 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1377 static reg_errcode_t
1378 link_nfa_nodes (void *extra, bin_tree_t *node)
1380 re_dfa_t *dfa = (re_dfa_t *) extra;
1381 Idx idx = node->node_idx;
1382 reg_errcode_t err = REG_NOERROR;
1384 switch (node->token.type)
1390 assert (node->next == NULL);
1393 case OP_DUP_ASTERISK:
1397 dfa->has_plural_match = 1;
1398 if (node->left != NULL)
1399 left = node->left->first->node_idx;
1401 left = node->next->node_idx;
1402 if (node->right != NULL)
1403 right = node->right->first->node_idx;
1405 right = node->next->node_idx;
1406 assert (REG_VALID_INDEX (left));
1407 assert (REG_VALID_INDEX (right));
1408 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1413 case OP_OPEN_SUBEXP:
1414 case OP_CLOSE_SUBEXP:
1415 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1419 dfa->nexts[idx] = node->next->node_idx;
1420 if (node->token.type == OP_BACK_REF)
1421 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1425 assert (!IS_EPSILON_NODE (node->token.type));
1426 dfa->nexts[idx] = node->next->node_idx;
1433 /* Duplicate the epsilon closure of the node ROOT_NODE.
1434 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1435 to their own constraint. */
1437 static reg_errcode_t
1439 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1440 Idx root_node, unsigned int init_constraint)
1442 Idx org_node, clone_node;
1444 unsigned int constraint = init_constraint;
1445 for (org_node = top_org_node, clone_node = top_clone_node;;)
1447 Idx org_dest, clone_dest;
1448 if (dfa->nodes[org_node].type == OP_BACK_REF)
1450 /* If the back reference epsilon-transit, its destination must
1451 also have the constraint. Then duplicate the epsilon closure
1452 of the destination of the back reference, and store it in
1453 edests of the back reference. */
1454 org_dest = dfa->nexts[org_node];
1455 re_node_set_empty (dfa->edests + clone_node);
1456 clone_dest = duplicate_node (dfa, org_dest, constraint);
1457 if (BE (clone_dest == REG_MISSING, 0))
1459 dfa->nexts[clone_node] = dfa->nexts[org_node];
1460 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1464 else if (dfa->edests[org_node].nelem == 0)
1466 /* In case of the node can't epsilon-transit, don't duplicate the
1467 destination and store the original destination as the
1468 destination of the node. */
1469 dfa->nexts[clone_node] = dfa->nexts[org_node];
1472 else if (dfa->edests[org_node].nelem == 1)
1474 /* In case of the node can epsilon-transit, and it has only one
1476 org_dest = dfa->edests[org_node].elems[0];
1477 re_node_set_empty (dfa->edests + clone_node);
1478 if (dfa->nodes[org_node].type == ANCHOR)
1480 /* In case of the node has another constraint, append it. */
1481 if (org_node == root_node && clone_node != org_node)
1483 /* ...but if the node is root_node itself, it means the
1484 epsilon closure have a loop, then tie it to the
1485 destination of the root_node. */
1486 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1491 constraint |= dfa->nodes[org_node].opr.ctx_type;
1493 clone_dest = duplicate_node (dfa, org_dest, constraint);
1494 if (BE (clone_dest == REG_MISSING, 0))
1496 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1500 else /* dfa->edests[org_node].nelem == 2 */
1502 /* In case of the node can epsilon-transit, and it has two
1503 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1504 org_dest = dfa->edests[org_node].elems[0];
1505 re_node_set_empty (dfa->edests + clone_node);
1506 /* Search for a duplicated node which satisfies the constraint. */
1507 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1508 if (clone_dest == REG_MISSING)
1510 /* There are no such a duplicated node, create a new one. */
1512 clone_dest = duplicate_node (dfa, org_dest, constraint);
1513 if (BE (clone_dest == REG_MISSING, 0))
1515 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1518 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1519 root_node, constraint);
1520 if (BE (err != REG_NOERROR, 0))
1525 /* There are a duplicated node which satisfy the constraint,
1526 use it to avoid infinite loop. */
1527 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1532 org_dest = dfa->edests[org_node].elems[1];
1533 clone_dest = duplicate_node (dfa, org_dest, constraint);
1534 if (BE (clone_dest == REG_MISSING, 0))
1536 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1540 org_node = org_dest;
1541 clone_node = clone_dest;
1546 /* Search for a node which is duplicated from the node ORG_NODE, and
1547 satisfies the constraint CONSTRAINT. */
1550 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1551 unsigned int constraint)
1554 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1556 if (org_node == dfa->org_indices[idx]
1557 && constraint == dfa->nodes[idx].constraint)
1558 return idx; /* Found. */
1560 return REG_MISSING; /* Not found. */
1563 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1564 Return the index of the new node, or REG_MISSING if insufficient storage is
1568 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1570 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1571 if (BE (dup_idx != REG_MISSING, 1))
1573 dfa->nodes[dup_idx].constraint = constraint;
1574 if (dfa->nodes[org_idx].type == ANCHOR)
1575 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1576 dfa->nodes[dup_idx].duplicated = 1;
1578 /* Store the index of the original node. */
1579 dfa->org_indices[dup_idx] = org_idx;
1584 static reg_errcode_t
1585 calc_inveclosure (re_dfa_t *dfa)
1589 for (idx = 0; idx < dfa->nodes_len; ++idx)
1590 re_node_set_init_empty (dfa->inveclosures + idx);
1592 for (src = 0; src < dfa->nodes_len; ++src)
1594 Idx *elems = dfa->eclosures[src].elems;
1595 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1597 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1606 /* Calculate "eclosure" for all the node in DFA. */
1608 static reg_errcode_t
1609 calc_eclosure (re_dfa_t *dfa)
1614 assert (dfa->nodes_len > 0);
1617 /* For each nodes, calculate epsilon closure. */
1618 for (node_idx = 0; ; ++node_idx)
1621 re_node_set eclosure_elem;
1622 if (node_idx == dfa->nodes_len)
1631 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1634 /* If we have already calculated, skip it. */
1635 if (dfa->eclosures[node_idx].nelem != 0)
1637 /* Calculate epsilon closure of `node_idx'. */
1638 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1639 if (BE (err != REG_NOERROR, 0))
1642 if (dfa->eclosures[node_idx].nelem == 0)
1645 re_node_set_free (&eclosure_elem);
1651 /* Calculate epsilon closure of NODE. */
1653 static reg_errcode_t
1654 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1657 unsigned int constraint;
1661 re_node_set eclosure;
1663 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1664 if (BE (err != REG_NOERROR, 0))
1667 /* This indicates that we are calculating this node now.
1668 We reference this value to avoid infinite loop. */
1669 dfa->eclosures[node].nelem = REG_MISSING;
1671 constraint = ((dfa->nodes[node].type == ANCHOR)
1672 ? dfa->nodes[node].opr.ctx_type : 0);
1673 /* If the current node has constraints, duplicate all nodes.
1674 Since they must inherit the constraints. */
1676 && dfa->edests[node].nelem
1677 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1679 err = duplicate_node_closure (dfa, node, node, node, constraint);
1680 if (BE (err != REG_NOERROR, 0))
1684 /* Expand each epsilon destination nodes. */
1685 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1686 for (i = 0; i < dfa->edests[node].nelem; ++i)
1688 re_node_set eclosure_elem;
1689 Idx edest = dfa->edests[node].elems[i];
1690 /* If calculating the epsilon closure of `edest' is in progress,
1691 return intermediate result. */
1692 if (dfa->eclosures[edest].nelem == REG_MISSING)
1697 /* If we haven't calculated the epsilon closure of `edest' yet,
1698 calculate now. Otherwise use calculated epsilon closure. */
1699 if (dfa->eclosures[edest].nelem == 0)
1701 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1702 if (BE (err != REG_NOERROR, 0))
1706 eclosure_elem = dfa->eclosures[edest];
1707 /* Merge the epsilon closure of `edest'. */
1708 re_node_set_merge (&eclosure, &eclosure_elem);
1709 /* If the epsilon closure of `edest' is incomplete,
1710 the epsilon closure of this node is also incomplete. */
1711 if (dfa->eclosures[edest].nelem == 0)
1714 re_node_set_free (&eclosure_elem);
1718 /* Epsilon closures include itself. */
1719 ok = re_node_set_insert (&eclosure, node);
1722 if (incomplete && !root)
1723 dfa->eclosures[node].nelem = 0;
1725 dfa->eclosures[node] = eclosure;
1726 *new_set = eclosure;
1730 /* Functions for token which are used in the parser. */
1732 /* Fetch a token from INPUT.
1733 We must not use this function inside bracket expressions. */
1737 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1739 re_string_skip_bytes (input, peek_token (result, input, syntax));
1742 /* Peek a token from INPUT, and return the length of the token.
1743 We must not use this function inside bracket expressions. */
1747 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1751 if (re_string_eoi (input))
1753 token->type = END_OF_RE;
1757 c = re_string_peek_byte (input, 0);
1760 token->word_char = 0;
1761 #ifdef RE_ENABLE_I18N
1762 token->mb_partial = 0;
1763 if (input->mb_cur_max > 1 &&
1764 !re_string_first_byte (input, re_string_cur_idx (input)))
1766 token->type = CHARACTER;
1767 token->mb_partial = 1;
1774 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1776 token->type = BACK_SLASH;
1780 c2 = re_string_peek_byte_case (input, 1);
1782 token->type = CHARACTER;
1783 #ifdef RE_ENABLE_I18N
1784 if (input->mb_cur_max > 1)
1786 wint_t wc = re_string_wchar_at (input,
1787 re_string_cur_idx (input) + 1);
1788 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1792 token->word_char = IS_WORD_CHAR (c2) != 0;
1797 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1798 token->type = OP_ALT;
1800 case '1': case '2': case '3': case '4': case '5':
1801 case '6': case '7': case '8': case '9':
1802 if (!(syntax & RE_NO_BK_REFS))
1804 token->type = OP_BACK_REF;
1805 token->opr.idx = c2 - '1';
1809 if (!(syntax & RE_NO_GNU_OPS))
1811 token->type = ANCHOR;
1812 token->opr.ctx_type = WORD_FIRST;
1816 if (!(syntax & RE_NO_GNU_OPS))
1818 token->type = ANCHOR;
1819 token->opr.ctx_type = WORD_LAST;
1823 if (!(syntax & RE_NO_GNU_OPS))
1825 token->type = ANCHOR;
1826 token->opr.ctx_type = WORD_DELIM;
1830 if (!(syntax & RE_NO_GNU_OPS))
1832 token->type = ANCHOR;
1833 token->opr.ctx_type = NOT_WORD_DELIM;
1837 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = OP_WORD;
1841 if (!(syntax & RE_NO_GNU_OPS))
1842 token->type = OP_NOTWORD;
1845 if (!(syntax & RE_NO_GNU_OPS))
1846 token->type = OP_SPACE;
1849 if (!(syntax & RE_NO_GNU_OPS))
1850 token->type = OP_NOTSPACE;
1853 if (!(syntax & RE_NO_GNU_OPS))
1855 token->type = ANCHOR;
1856 token->opr.ctx_type = BUF_FIRST;
1860 if (!(syntax & RE_NO_GNU_OPS))
1862 token->type = ANCHOR;
1863 token->opr.ctx_type = BUF_LAST;
1867 if (!(syntax & RE_NO_BK_PARENS))
1868 token->type = OP_OPEN_SUBEXP;
1871 if (!(syntax & RE_NO_BK_PARENS))
1872 token->type = OP_CLOSE_SUBEXP;
1875 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1876 token->type = OP_DUP_PLUS;
1879 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1880 token->type = OP_DUP_QUESTION;
1883 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1884 token->type = OP_OPEN_DUP_NUM;
1887 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1888 token->type = OP_CLOSE_DUP_NUM;
1896 token->type = CHARACTER;
1897 #ifdef RE_ENABLE_I18N
1898 if (input->mb_cur_max > 1)
1900 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1901 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1905 token->word_char = IS_WORD_CHAR (token->opr.c);
1910 if (syntax & RE_NEWLINE_ALT)
1911 token->type = OP_ALT;
1914 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1915 token->type = OP_ALT;
1918 token->type = OP_DUP_ASTERISK;
1921 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1922 token->type = OP_DUP_PLUS;
1925 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1926 token->type = OP_DUP_QUESTION;
1929 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1930 token->type = OP_OPEN_DUP_NUM;
1933 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1934 token->type = OP_CLOSE_DUP_NUM;
1937 if (syntax & RE_NO_BK_PARENS)
1938 token->type = OP_OPEN_SUBEXP;
1941 if (syntax & RE_NO_BK_PARENS)
1942 token->type = OP_CLOSE_SUBEXP;
1945 token->type = OP_OPEN_BRACKET;
1948 token->type = OP_PERIOD;
1951 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1952 re_string_cur_idx (input) != 0)
1954 char prev = re_string_peek_byte (input, -1);
1955 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1958 token->type = ANCHOR;
1959 token->opr.ctx_type = LINE_FIRST;
1962 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1963 re_string_cur_idx (input) + 1 != re_string_length (input))
1966 re_string_skip_bytes (input, 1);
1967 peek_token (&next, input, syntax);
1968 re_string_skip_bytes (input, -1);
1969 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1972 token->type = ANCHOR;
1973 token->opr.ctx_type = LINE_LAST;
1981 /* Peek a token from INPUT, and return the length of the token.
1982 We must not use this function out of bracket expressions. */
1986 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1989 if (re_string_eoi (input))
1991 token->type = END_OF_RE;
1994 c = re_string_peek_byte (input, 0);
1997 #ifdef RE_ENABLE_I18N
1998 if (input->mb_cur_max > 1 &&
1999 !re_string_first_byte (input, re_string_cur_idx (input)))
2001 token->type = CHARACTER;
2004 #endif /* RE_ENABLE_I18N */
2006 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2007 && re_string_cur_idx (input) + 1 < re_string_length (input))
2009 /* In this case, '\' escape a character. */
2011 re_string_skip_bytes (input, 1);
2012 c2 = re_string_peek_byte (input, 0);
2014 token->type = CHARACTER;
2017 if (c == '[') /* '[' is a special char in a bracket exps. */
2021 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2022 c2 = re_string_peek_byte (input, 1);
2030 token->type = OP_OPEN_COLL_ELEM;
2033 token->type = OP_OPEN_EQUIV_CLASS;
2036 if (syntax & RE_CHAR_CLASSES)
2038 token->type = OP_OPEN_CHAR_CLASS;
2041 /* else fall through. */
2043 token->type = CHARACTER;
2053 token->type = OP_CHARSET_RANGE;
2056 token->type = OP_CLOSE_BRACKET;
2059 token->type = OP_NON_MATCH_LIST;
2062 token->type = CHARACTER;
2067 /* Functions for parser. */
2069 /* Entry point of the parser.
2070 Parse the regular expression REGEXP and return the structure tree.
2071 If an error is occured, ERR is set by error code, and return NULL.
2072 This function build the following tree, from regular expression <reg_exp>:
2078 CAT means concatenation.
2079 EOR means end of regular expression. */
2082 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2085 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2086 bin_tree_t *tree, *eor, *root;
2087 re_token_t current_token;
2088 dfa->syntax = syntax;
2089 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2090 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2091 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2093 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2095 root = create_tree (dfa, tree, eor, CONCAT);
2098 if (BE (eor == NULL || root == NULL, 0))
2106 /* This function build the following tree, from regular expression
2107 <branch1>|<branch2>:
2113 ALT means alternative, which represents the operator `|'. */
2116 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2117 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2119 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2120 bin_tree_t *tree, *branch = NULL;
2121 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2122 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2125 while (token->type == OP_ALT)
2127 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2128 if (token->type != OP_ALT && token->type != END_OF_RE
2129 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2131 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2132 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2137 tree = create_tree (dfa, tree, branch, OP_ALT);
2138 if (BE (tree == NULL, 0))
2147 /* This function build the following tree, from regular expression
2154 CAT means concatenation. */
2157 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2158 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2160 bin_tree_t *tree, *expr;
2161 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2162 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2163 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2166 while (token->type != OP_ALT && token->type != END_OF_RE
2167 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2169 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2170 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2174 if (tree != NULL && expr != NULL)
2176 tree = create_tree (dfa, tree, expr, CONCAT);
2183 else if (tree == NULL)
2185 /* Otherwise expr == NULL, we don't need to create new tree. */
2190 /* This function build the following tree, from regular expression a*:
2197 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2198 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2200 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2202 switch (token->type)
2205 tree = create_token_tree (dfa, NULL, NULL, token);
2206 if (BE (tree == NULL, 0))
2211 #ifdef RE_ENABLE_I18N
2212 if (dfa->mb_cur_max > 1)
2214 while (!re_string_eoi (regexp)
2215 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2217 bin_tree_t *mbc_remain;
2218 fetch_token (token, regexp, syntax);
2219 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2220 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2221 if (BE (mbc_remain == NULL || tree == NULL, 0))
2230 case OP_OPEN_SUBEXP:
2231 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2232 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2235 case OP_OPEN_BRACKET:
2236 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2237 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2241 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2246 dfa->used_bkref_map |= 1 << token->opr.idx;
2247 tree = create_token_tree (dfa, NULL, NULL, token);
2248 if (BE (tree == NULL, 0))
2254 dfa->has_mb_node = 1;
2256 case OP_OPEN_DUP_NUM:
2257 if (syntax & RE_CONTEXT_INVALID_DUP)
2263 case OP_DUP_ASTERISK:
2265 case OP_DUP_QUESTION:
2266 if (syntax & RE_CONTEXT_INVALID_OPS)
2271 else if (syntax & RE_CONTEXT_INDEP_OPS)
2273 fetch_token (token, regexp, syntax);
2274 return parse_expression (regexp, preg, token, syntax, nest, err);
2276 /* else fall through */
2277 case OP_CLOSE_SUBEXP:
2278 if ((token->type == OP_CLOSE_SUBEXP) &&
2279 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2284 /* else fall through */
2285 case OP_CLOSE_DUP_NUM:
2286 /* We treat it as a normal character. */
2288 /* Then we can these characters as normal characters. */
2289 token->type = CHARACTER;
2290 /* mb_partial and word_char bits should be initialized already
2292 tree = create_token_tree (dfa, NULL, NULL, token);
2293 if (BE (tree == NULL, 0))
2300 if ((token->opr.ctx_type
2301 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2302 && dfa->word_ops_used == 0)
2303 init_word_char (dfa);
2304 if (token->opr.ctx_type == WORD_DELIM
2305 || token->opr.ctx_type == NOT_WORD_DELIM)
2307 bin_tree_t *tree_first, *tree_last;
2308 if (token->opr.ctx_type == WORD_DELIM)
2310 token->opr.ctx_type = WORD_FIRST;
2311 tree_first = create_token_tree (dfa, NULL, NULL, token);
2312 token->opr.ctx_type = WORD_LAST;
2316 token->opr.ctx_type = INSIDE_WORD;
2317 tree_first = create_token_tree (dfa, NULL, NULL, token);
2318 token->opr.ctx_type = INSIDE_NOTWORD;
2320 tree_last = create_token_tree (dfa, NULL, NULL, token);
2321 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2322 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2330 tree = create_token_tree (dfa, NULL, NULL, token);
2331 if (BE (tree == NULL, 0))
2337 /* We must return here, since ANCHORs can't be followed
2338 by repetition operators.
2339 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2340 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2341 fetch_token (token, regexp, syntax);
2344 tree = create_token_tree (dfa, NULL, NULL, token);
2345 if (BE (tree == NULL, 0))
2350 if (dfa->mb_cur_max > 1)
2351 dfa->has_mb_node = 1;
2355 tree = build_charclass_op (dfa, regexp->trans,
2356 (const unsigned char *) "alnum",
2357 (const unsigned char *) "_",
2358 token->type == OP_NOTWORD, err);
2359 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2364 tree = build_charclass_op (dfa, regexp->trans,
2365 (const unsigned char *) "space",
2366 (const unsigned char *) "",
2367 token->type == OP_NOTSPACE, err);
2368 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2378 /* Must not happen? */
2384 fetch_token (token, regexp, syntax);
2386 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2387 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2389 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2390 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2392 /* In BRE consecutive duplications are not allowed. */
2393 if ((syntax & RE_CONTEXT_INVALID_DUP)
2394 && (token->type == OP_DUP_ASTERISK
2395 || token->type == OP_OPEN_DUP_NUM))
2405 /* This function build the following tree, from regular expression
2413 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2414 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2416 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2419 cur_nsub = preg->re_nsub++;
2421 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2423 /* The subexpression may be a null string. */
2424 if (token->type == OP_CLOSE_SUBEXP)
2428 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2429 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2431 if (BE (*err != REG_NOERROR, 0))
2435 if (cur_nsub <= '9' - '1')
2436 dfa->completed_bkref_map |= 1 << cur_nsub;
2438 tree = create_tree (dfa, tree, NULL, SUBEXP);
2439 if (BE (tree == NULL, 0))
2444 tree->token.opr.idx = cur_nsub;
2448 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2451 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2452 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2454 bin_tree_t *tree = NULL, *old_tree = NULL;
2455 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2456 re_token_t start_token = *token;
2458 if (token->type == OP_OPEN_DUP_NUM)
2461 start = fetch_number (regexp, token, syntax);
2462 if (start == REG_MISSING)
2464 if (token->type == CHARACTER && token->opr.c == ',')
2465 start = 0; /* We treat "{,m}" as "{0,m}". */
2468 *err = REG_BADBR; /* <re>{} is invalid. */
2472 if (BE (start != REG_ERROR, 1))
2474 /* We treat "{n}" as "{n,n}". */
2475 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2476 : ((token->type == CHARACTER && token->opr.c == ',')
2477 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2479 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2481 /* Invalid sequence. */
2482 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2484 if (token->type == END_OF_RE)
2492 /* If the syntax bit is set, rollback. */
2493 re_string_set_index (regexp, start_idx);
2494 *token = start_token;
2495 token->type = CHARACTER;
2496 /* mb_partial and word_char bits should be already initialized by
2501 if (BE (end != REG_MISSING && start > end, 0))
2503 /* First number greater than second. */
2510 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2511 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2514 fetch_token (token, regexp, syntax);
2516 if (BE (elem == NULL, 0))
2518 if (BE (start == 0 && end == 0, 0))
2520 postorder (elem, free_tree, NULL);
2524 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2525 if (BE (start > 0, 0))
2528 for (i = 2; i <= start; ++i)
2530 elem = duplicate_tree (elem, dfa);
2531 tree = create_tree (dfa, tree, elem, CONCAT);
2532 if (BE (elem == NULL || tree == NULL, 0))
2533 goto parse_dup_op_espace;
2539 /* Duplicate ELEM before it is marked optional. */
2540 elem = duplicate_tree (elem, dfa);
2546 if (elem->token.type == SUBEXP)
2547 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2549 tree = create_tree (dfa, elem, NULL,
2550 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2551 if (BE (tree == NULL, 0))
2552 goto parse_dup_op_espace;
2554 /* This loop is actually executed only when end != REG_MISSING,
2555 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2556 already created the start+1-th copy. */
2557 if ((Idx) -1 < 0 || end != REG_MISSING)
2558 for (i = start + 2; i <= end; ++i)
2560 elem = duplicate_tree (elem, dfa);
2561 tree = create_tree (dfa, tree, elem, CONCAT);
2562 if (BE (elem == NULL || tree == NULL, 0))
2563 goto parse_dup_op_espace;
2565 tree = create_tree (dfa, tree, NULL, OP_ALT);
2566 if (BE (tree == NULL, 0))
2567 goto parse_dup_op_espace;
2571 tree = create_tree (dfa, old_tree, tree, CONCAT);
2575 parse_dup_op_espace:
2580 /* Size of the names for collating symbol/equivalence_class/character_class.
2581 I'm not sure, but maybe enough. */
2582 #define BRACKET_NAME_BUF_SIZE 32
2585 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2586 Build the range expression which starts from START_ELEM, and ends
2587 at END_ELEM. The result are written to MBCSET and SBCSET.
2588 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2589 mbcset->range_ends, is a pointer argument sinse we may
2592 static reg_errcode_t
2594 # ifdef RE_ENABLE_I18N
2595 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2596 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2597 # else /* not RE_ENABLE_I18N */
2598 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2599 bracket_elem_t *end_elem)
2600 # endif /* not RE_ENABLE_I18N */
2602 unsigned int start_ch, end_ch;
2603 /* Equivalence Classes and Character Classes can't be a range start/end. */
2604 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2605 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2609 /* We can handle no multi character collating elements without libc
2611 if (BE ((start_elem->type == COLL_SYM
2612 && strlen ((char *) start_elem->opr.name) > 1)
2613 || (end_elem->type == COLL_SYM
2614 && strlen ((char *) end_elem->opr.name) > 1), 0))
2615 return REG_ECOLLATE;
2617 # ifdef RE_ENABLE_I18N
2622 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2624 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2625 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2627 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2628 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2630 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2631 ? __btowc (start_ch) : start_elem->opr.wch);
2632 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2633 ? __btowc (end_ch) : end_elem->opr.wch);
2634 if (start_wc == WEOF || end_wc == WEOF)
2635 return REG_ECOLLATE;
2636 cmp_buf[0] = start_wc;
2637 cmp_buf[4] = end_wc;
2638 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2641 /* Got valid collation sequence values, add them as a new entry.
2642 However, for !_LIBC we have no collation elements: if the
2643 character set is single byte, the single byte character set
2644 that we build below suffices. parse_bracket_exp passes
2645 no MBCSET if dfa->mb_cur_max == 1. */
2648 /* Check the space of the arrays. */
2649 if (BE (*range_alloc == mbcset->nranges, 0))
2651 /* There is not enough space, need realloc. */
2652 wchar_t *new_array_start, *new_array_end;
2655 /* +1 in case of mbcset->nranges is 0. */
2656 new_nranges = 2 * mbcset->nranges + 1;
2657 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2658 are NULL if *range_alloc == 0. */
2659 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2661 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2664 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2667 mbcset->range_starts = new_array_start;
2668 mbcset->range_ends = new_array_end;
2669 *range_alloc = new_nranges;
2672 mbcset->range_starts[mbcset->nranges] = start_wc;
2673 mbcset->range_ends[mbcset->nranges++] = end_wc;
2676 /* Build the table for single byte characters. */
2677 for (wc = 0; wc < SBC_MAX; ++wc)
2680 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2681 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2682 bitset_set (sbcset, wc);
2685 # else /* not RE_ENABLE_I18N */
2688 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2689 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2691 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2692 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2694 if (start_ch > end_ch)
2696 /* Build the table for single byte characters. */
2697 for (ch = 0; ch < SBC_MAX; ++ch)
2698 if (start_ch <= ch && ch <= end_ch)
2699 bitset_set (sbcset, ch);
2701 # endif /* not RE_ENABLE_I18N */
2704 #endif /* not _LIBC */
2707 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2708 Build the collating element which is represented by NAME.
2709 The result are written to MBCSET and SBCSET.
2710 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2711 pointer argument since we may update it. */
2713 static reg_errcode_t
2715 build_collating_symbol (bitset_t sbcset,
2716 # ifdef RE_ENABLE_I18N
2717 re_charset_t *mbcset, Idx *coll_sym_alloc,
2719 const unsigned char *name)
2721 size_t name_len = strlen ((const char *) name);
2722 if (BE (name_len != 1, 0))
2723 return REG_ECOLLATE;
2726 bitset_set (sbcset, name[0]);
2730 #endif /* not _LIBC */
2732 /* This function parse bracket expression like "[abc]", "[a-c]",
2736 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2737 reg_syntax_t syntax, reg_errcode_t *err)
2740 const unsigned char *collseqmb;
2741 const char *collseqwc;
2744 const int32_t *symb_table;
2745 const unsigned char *extra;
2747 /* Local function for parse_bracket_exp used in _LIBC environement.
2748 Seek the collating symbol entry correspondings to NAME.
2749 Return the index of the symbol in the SYMB_TABLE. */
2752 __attribute ((always_inline))
2753 seek_collating_symbol_entry (name, name_len)
2754 const unsigned char *name;
2757 int32_t hash = elem_hash ((const char *) name, name_len);
2758 int32_t elem = hash % table_size;
2759 if (symb_table[2 * elem] != 0)
2761 int32_t second = hash % (table_size - 2) + 1;
2765 /* First compare the hashing value. */
2766 if (symb_table[2 * elem] == hash
2767 /* Compare the length of the name. */
2768 && name_len == extra[symb_table[2 * elem + 1]]
2769 /* Compare the name. */
2770 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2773 /* Yep, this is the entry. */
2780 while (symb_table[2 * elem] != 0);
2785 /* Local function for parse_bracket_exp used in _LIBC environement.
2786 Look up the collation sequence value of BR_ELEM.
2787 Return the value if succeeded, UINT_MAX otherwise. */
2789 auto inline unsigned int
2790 __attribute ((always_inline))
2791 lookup_collation_sequence_value (br_elem)
2792 bracket_elem_t *br_elem;
2794 if (br_elem->type == SB_CHAR)
2797 if (MB_CUR_MAX == 1)
2800 return collseqmb[br_elem->opr.ch];
2803 wint_t wc = __btowc (br_elem->opr.ch);
2804 return __collseq_table_lookup (collseqwc, wc);
2807 else if (br_elem->type == MB_CHAR)
2809 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2811 else if (br_elem->type == COLL_SYM)
2813 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2817 elem = seek_collating_symbol_entry (br_elem->opr.name,
2819 if (symb_table[2 * elem] != 0)
2821 /* We found the entry. */
2822 idx = symb_table[2 * elem + 1];
2823 /* Skip the name of collating element name. */
2824 idx += 1 + extra[idx];
2825 /* Skip the byte sequence of the collating element. */
2826 idx += 1 + extra[idx];
2827 /* Adjust for the alignment. */
2828 idx = (idx + 3) & ~3;
2829 /* Skip the multibyte collation sequence value. */
2830 idx += sizeof (unsigned int);
2831 /* Skip the wide char sequence of the collating element. */
2832 idx += sizeof (unsigned int) *
2833 (1 + *(unsigned int *) (extra + idx));
2834 /* Return the collation sequence value. */
2835 return *(unsigned int *) (extra + idx);
2837 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2839 /* No valid character. Match it as a single byte
2841 return collseqmb[br_elem->opr.name[0]];
2844 else if (sym_name_len == 1)
2845 return collseqmb[br_elem->opr.name[0]];
2850 /* Local function for parse_bracket_exp used in _LIBC environement.
2851 Build the range expression which starts from START_ELEM, and ends
2852 at END_ELEM. The result are written to MBCSET and SBCSET.
2853 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2854 mbcset->range_ends, is a pointer argument sinse we may
2857 auto inline reg_errcode_t
2858 __attribute ((always_inline))
2859 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2860 re_charset_t *mbcset;
2863 bracket_elem_t *start_elem, *end_elem;
2866 uint32_t start_collseq;
2867 uint32_t end_collseq;
2869 /* Equivalence Classes and Character Classes can't be a range
2871 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2872 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2876 start_collseq = lookup_collation_sequence_value (start_elem);
2877 end_collseq = lookup_collation_sequence_value (end_elem);
2878 /* Check start/end collation sequence values. */
2879 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2880 return REG_ECOLLATE;
2881 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2884 /* Got valid collation sequence values, add them as a new entry.
2885 However, if we have no collation elements, and the character set
2886 is single byte, the single byte character set that we
2887 build below suffices. */
2888 if (nrules > 0 || dfa->mb_cur_max > 1)
2890 /* Check the space of the arrays. */
2891 if (BE (*range_alloc == mbcset->nranges, 0))
2893 /* There is not enough space, need realloc. */
2894 uint32_t *new_array_start;
2895 uint32_t *new_array_end;
2898 /* +1 in case of mbcset->nranges is 0. */
2899 new_nranges = 2 * mbcset->nranges + 1;
2900 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2902 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2905 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2908 mbcset->range_starts = new_array_start;
2909 mbcset->range_ends = new_array_end;
2910 *range_alloc = new_nranges;
2913 mbcset->range_starts[mbcset->nranges] = start_collseq;
2914 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2917 /* Build the table for single byte characters. */
2918 for (ch = 0; ch < SBC_MAX; ch++)
2920 uint32_t ch_collseq;
2922 if (MB_CUR_MAX == 1)
2925 ch_collseq = collseqmb[ch];
2927 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2928 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2929 bitset_set (sbcset, ch);
2934 /* Local function for parse_bracket_exp used in _LIBC environement.
2935 Build the collating element which is represented by NAME.
2936 The result are written to MBCSET and SBCSET.
2937 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2938 pointer argument sinse we may update it. */
2940 auto inline reg_errcode_t
2941 __attribute ((always_inline))
2942 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2943 re_charset_t *mbcset;
2944 Idx *coll_sym_alloc;
2946 const unsigned char *name;
2949 size_t name_len = strlen ((const char *) name);
2952 elem = seek_collating_symbol_entry (name, name_len);
2953 if (symb_table[2 * elem] != 0)
2955 /* We found the entry. */
2956 idx = symb_table[2 * elem + 1];
2957 /* Skip the name of collating element name. */
2958 idx += 1 + extra[idx];
2960 else if (symb_table[2 * elem] == 0 && name_len == 1)
2962 /* No valid character, treat it as a normal
2964 bitset_set (sbcset, name[0]);
2968 return REG_ECOLLATE;
2970 /* Got valid collation sequence, add it as a new entry. */
2971 /* Check the space of the arrays. */
2972 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2974 /* Not enough, realloc it. */
2975 /* +1 in case of mbcset->ncoll_syms is 0. */
2976 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2977 /* Use realloc since mbcset->coll_syms is NULL
2979 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2980 new_coll_sym_alloc);
2981 if (BE (new_coll_syms == NULL, 0))
2983 mbcset->coll_syms = new_coll_syms;
2984 *coll_sym_alloc = new_coll_sym_alloc;
2986 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2991 if (BE (name_len != 1, 0))
2992 return REG_ECOLLATE;
2995 bitset_set (sbcset, name[0]);
3002 re_token_t br_token;
3003 re_bitset_ptr_t sbcset;
3004 #ifdef RE_ENABLE_I18N
3005 re_charset_t *mbcset;
3006 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3007 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3008 #endif /* not RE_ENABLE_I18N */
3009 bool non_match = false;
3010 bin_tree_t *work_tree;
3012 bool first_round = true;
3014 collseqmb = (const unsigned char *)
3015 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3016 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3022 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3023 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3024 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3025 _NL_COLLATE_SYMB_TABLEMB);
3026 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3027 _NL_COLLATE_SYMB_EXTRAMB);
3030 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3031 #ifdef RE_ENABLE_I18N
3032 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3033 #endif /* RE_ENABLE_I18N */
3034 #ifdef RE_ENABLE_I18N
3035 if (BE (sbcset == NULL || mbcset == NULL, 0))
3037 if (BE (sbcset == NULL, 0))
3038 #endif /* RE_ENABLE_I18N */
3044 token_len = peek_token_bracket (token, regexp, syntax);
3045 if (BE (token->type == END_OF_RE, 0))
3048 goto parse_bracket_exp_free_return;
3050 if (token->type == OP_NON_MATCH_LIST)
3052 #ifdef RE_ENABLE_I18N
3053 mbcset->non_match = 1;
3054 #endif /* not RE_ENABLE_I18N */
3056 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3057 bitset_set (sbcset, '\n');
3058 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3059 token_len = peek_token_bracket (token, regexp, syntax);
3060 if (BE (token->type == END_OF_RE, 0))
3063 goto parse_bracket_exp_free_return;
3067 /* We treat the first ']' as a normal character. */
3068 if (token->type == OP_CLOSE_BRACKET)
3069 token->type = CHARACTER;
3073 bracket_elem_t start_elem, end_elem;
3074 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3075 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3078 bool is_range_exp = false;
3081 start_elem.opr.name = start_name_buf;
3082 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3083 syntax, first_round);
3084 if (BE (ret != REG_NOERROR, 0))
3087 goto parse_bracket_exp_free_return;
3089 first_round = false;
3091 /* Get information about the next token. We need it in any case. */
3092 token_len = peek_token_bracket (token, regexp, syntax);
3094 /* Do not check for ranges if we know they are not allowed. */
3095 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3097 if (BE (token->type == END_OF_RE, 0))
3100 goto parse_bracket_exp_free_return;
3102 if (token->type == OP_CHARSET_RANGE)
3104 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3105 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3106 if (BE (token2.type == END_OF_RE, 0))
3109 goto parse_bracket_exp_free_return;
3111 if (token2.type == OP_CLOSE_BRACKET)
3113 /* We treat the last '-' as a normal character. */
3114 re_string_skip_bytes (regexp, -token_len);
3115 token->type = CHARACTER;
3118 is_range_exp = true;
3122 if (is_range_exp == true)
3124 end_elem.opr.name = end_name_buf;
3125 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3127 if (BE (ret != REG_NOERROR, 0))
3130 goto parse_bracket_exp_free_return;
3133 token_len = peek_token_bracket (token, regexp, syntax);
3136 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3137 &start_elem, &end_elem);
3139 # ifdef RE_ENABLE_I18N
3140 *err = build_range_exp (sbcset,
3141 dfa->mb_cur_max > 1 ? mbcset : NULL,
3142 &range_alloc, &start_elem, &end_elem);
3144 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3146 #endif /* RE_ENABLE_I18N */
3147 if (BE (*err != REG_NOERROR, 0))
3148 goto parse_bracket_exp_free_return;
3152 switch (start_elem.type)
3155 bitset_set (sbcset, start_elem.opr.ch);
3157 #ifdef RE_ENABLE_I18N
3159 /* Check whether the array has enough space. */
3160 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3162 wchar_t *new_mbchars;
3163 /* Not enough, realloc it. */
3164 /* +1 in case of mbcset->nmbchars is 0. */
3165 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3166 /* Use realloc since array is NULL if *alloc == 0. */
3167 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3169 if (BE (new_mbchars == NULL, 0))
3170 goto parse_bracket_exp_espace;
3171 mbcset->mbchars = new_mbchars;
3173 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3175 #endif /* RE_ENABLE_I18N */
3177 *err = build_equiv_class (sbcset,
3178 #ifdef RE_ENABLE_I18N
3179 mbcset, &equiv_class_alloc,
3180 #endif /* RE_ENABLE_I18N */
3181 start_elem.opr.name);
3182 if (BE (*err != REG_NOERROR, 0))
3183 goto parse_bracket_exp_free_return;
3186 *err = build_collating_symbol (sbcset,
3187 #ifdef RE_ENABLE_I18N
3188 mbcset, &coll_sym_alloc,
3189 #endif /* RE_ENABLE_I18N */
3190 start_elem.opr.name);
3191 if (BE (*err != REG_NOERROR, 0))
3192 goto parse_bracket_exp_free_return;
3195 *err = build_charclass (regexp->trans, sbcset,
3196 #ifdef RE_ENABLE_I18N
3197 mbcset, &char_class_alloc,
3198 #endif /* RE_ENABLE_I18N */
3199 start_elem.opr.name, syntax);
3200 if (BE (*err != REG_NOERROR, 0))
3201 goto parse_bracket_exp_free_return;
3208 if (BE (token->type == END_OF_RE, 0))
3211 goto parse_bracket_exp_free_return;
3213 if (token->type == OP_CLOSE_BRACKET)
3217 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3219 /* If it is non-matching list. */
3221 bitset_not (sbcset);
3223 #ifdef RE_ENABLE_I18N
3224 /* Ensure only single byte characters are set. */
3225 if (dfa->mb_cur_max > 1)
3226 bitset_mask (sbcset, dfa->sb_char);
3228 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3229 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3230 || mbcset->non_match)))
3232 bin_tree_t *mbc_tree;
3234 /* Build a tree for complex bracket. */
3235 dfa->has_mb_node = 1;
3236 br_token.type = COMPLEX_BRACKET;
3237 br_token.opr.mbcset = mbcset;
3238 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3239 if (BE (mbc_tree == NULL, 0))
3240 goto parse_bracket_exp_espace;
3241 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3242 if (sbcset[sbc_idx])
3244 /* If there are no bits set in sbcset, there is no point
3245 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3246 if (sbc_idx < BITSET_WORDS)
3248 /* Build a tree for simple bracket. */
3249 br_token.type = SIMPLE_BRACKET;
3250 br_token.opr.sbcset = sbcset;
3251 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3252 if (BE (work_tree == NULL, 0))
3253 goto parse_bracket_exp_espace;
3255 /* Then join them by ALT node. */
3256 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3257 if (BE (work_tree == NULL, 0))
3258 goto parse_bracket_exp_espace;
3263 work_tree = mbc_tree;
3267 #endif /* not RE_ENABLE_I18N */
3269 #ifdef RE_ENABLE_I18N
3270 free_charset (mbcset);
3272 /* Build a tree for simple bracket. */
3273 br_token.type = SIMPLE_BRACKET;
3274 br_token.opr.sbcset = sbcset;
3275 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3276 if (BE (work_tree == NULL, 0))
3277 goto parse_bracket_exp_espace;
3281 parse_bracket_exp_espace:
3283 parse_bracket_exp_free_return:
3285 #ifdef RE_ENABLE_I18N
3286 free_charset (mbcset);
3287 #endif /* RE_ENABLE_I18N */
3291 /* Parse an element in the bracket expression. */
3293 static reg_errcode_t
3294 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3295 re_token_t *token, int token_len, re_dfa_t *dfa,
3296 reg_syntax_t syntax, bool accept_hyphen)
3298 #ifdef RE_ENABLE_I18N
3300 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3301 if (cur_char_size > 1)
3303 elem->type = MB_CHAR;
3304 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3305 re_string_skip_bytes (regexp, cur_char_size);
3308 #endif /* RE_ENABLE_I18N */
3309 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3310 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3311 || token->type == OP_OPEN_EQUIV_CLASS)
3312 return parse_bracket_symbol (elem, regexp, token);
3313 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3315 /* A '-' must only appear as anything but a range indicator before
3316 the closing bracket. Everything else is an error. */
3318 (void) peek_token_bracket (&token2, regexp, syntax);
3319 if (token2.type != OP_CLOSE_BRACKET)
3320 /* The actual error value is not standardized since this whole
3321 case is undefined. But ERANGE makes good sense. */
3324 elem->type = SB_CHAR;
3325 elem->opr.ch = token->opr.c;
3329 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3330 such as [:<character_class>:], [.<collating_element>.], and
3331 [=<equivalent_class>=]. */
3333 static reg_errcode_t
3334 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3337 unsigned char ch, delim = token->opr.c;
3339 if (re_string_eoi(regexp))
3343 if (i >= BRACKET_NAME_BUF_SIZE)
3345 if (token->type == OP_OPEN_CHAR_CLASS)
3346 ch = re_string_fetch_byte_case (regexp);
3348 ch = re_string_fetch_byte (regexp);
3349 if (re_string_eoi(regexp))
3351 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3353 elem->opr.name[i] = ch;
3355 re_string_skip_bytes (regexp, 1);
3356 elem->opr.name[i] = '\0';
3357 switch (token->type)
3359 case OP_OPEN_COLL_ELEM:
3360 elem->type = COLL_SYM;
3362 case OP_OPEN_EQUIV_CLASS:
3363 elem->type = EQUIV_CLASS;
3365 case OP_OPEN_CHAR_CLASS:
3366 elem->type = CHAR_CLASS;
3374 /* Helper function for parse_bracket_exp.
3375 Build the equivalence class which is represented by NAME.
3376 The result are written to MBCSET and SBCSET.
3377 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3378 is a pointer argument sinse we may update it. */
3380 static reg_errcode_t
3381 #ifdef RE_ENABLE_I18N
3382 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3383 Idx *equiv_class_alloc, const unsigned char *name)
3384 #else /* not RE_ENABLE_I18N */
3385 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3386 #endif /* not RE_ENABLE_I18N */
3389 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3392 const int32_t *table, *indirect;
3393 const unsigned char *weights, *extra, *cp;
3394 unsigned char char_buf[2];
3398 /* This #include defines a local function! */
3399 # include <locale/weight.h>
3400 /* Calculate the index for equivalence class. */
3402 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3403 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3404 _NL_COLLATE_WEIGHTMB);
3405 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3406 _NL_COLLATE_EXTRAMB);
3407 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3408 _NL_COLLATE_INDIRECTMB);
3409 idx1 = findidx (&cp);
3410 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3411 /* This isn't a valid character. */
3412 return REG_ECOLLATE;
3414 /* Build single byte matcing table for this equivalence class. */
3415 char_buf[1] = (unsigned char) '\0';
3416 len = weights[idx1];
3417 for (ch = 0; ch < SBC_MAX; ++ch)
3421 idx2 = findidx (&cp);
3426 /* This isn't a valid character. */
3428 if (len == weights[idx2])
3431 while (cnt <= len &&
3432 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3436 bitset_set (sbcset, ch);
3439 /* Check whether the array has enough space. */
3440 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3442 /* Not enough, realloc it. */
3443 /* +1 in case of mbcset->nequiv_classes is 0. */
3444 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3445 /* Use realloc since the array is NULL if *alloc == 0. */
3446 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3448 new_equiv_class_alloc);
3449 if (BE (new_equiv_classes == NULL, 0))
3451 mbcset->equiv_classes = new_equiv_classes;
3452 *equiv_class_alloc = new_equiv_class_alloc;
3454 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3459 if (BE (strlen ((const char *) name) != 1, 0))
3460 return REG_ECOLLATE;
3461 bitset_set (sbcset, *name);
3466 /* Helper function for parse_bracket_exp.
3467 Build the character class which is represented by NAME.
3468 The result are written to MBCSET and SBCSET.
3469 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3470 is a pointer argument sinse we may update it. */
3472 static reg_errcode_t
3473 #ifdef RE_ENABLE_I18N
3474 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3475 re_charset_t *mbcset, Idx *char_class_alloc,
3476 const unsigned char *class_name, reg_syntax_t syntax)
3477 #else /* not RE_ENABLE_I18N */
3478 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3479 const unsigned char *class_name, reg_syntax_t syntax)
3480 #endif /* not RE_ENABLE_I18N */
3483 const char *name = (const char *) class_name;
3485 /* In case of REG_ICASE "upper" and "lower" match the both of
3486 upper and lower cases. */
3487 if ((syntax & RE_ICASE)
3488 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3491 #ifdef RE_ENABLE_I18N
3492 /* Check the space of the arrays. */
3493 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3495 /* Not enough, realloc it. */
3496 /* +1 in case of mbcset->nchar_classes is 0. */
3497 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3498 /* Use realloc since array is NULL if *alloc == 0. */
3499 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3500 new_char_class_alloc);
3501 if (BE (new_char_classes == NULL, 0))
3503 mbcset->char_classes = new_char_classes;
3504 *char_class_alloc = new_char_class_alloc;
3506 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3507 #endif /* RE_ENABLE_I18N */
3509 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3511 if (BE (trans != NULL, 0)) \
3513 for (i = 0; i < SBC_MAX; ++i) \
3514 if (ctype_func (i)) \
3515 bitset_set (sbcset, trans[i]); \
3519 for (i = 0; i < SBC_MAX; ++i) \
3520 if (ctype_func (i)) \
3521 bitset_set (sbcset, i); \
3525 if (strcmp (name, "alnum") == 0)
3526 BUILD_CHARCLASS_LOOP (isalnum);
3527 else if (strcmp (name, "cntrl") == 0)
3528 BUILD_CHARCLASS_LOOP (iscntrl);
3529 else if (strcmp (name, "lower") == 0)
3530 BUILD_CHARCLASS_LOOP (islower);
3531 else if (strcmp (name, "space") == 0)
3532 BUILD_CHARCLASS_LOOP (isspace);
3533 else if (strcmp (name, "alpha") == 0)
3534 BUILD_CHARCLASS_LOOP (isalpha);
3535 else if (strcmp (name, "digit") == 0)
3536 BUILD_CHARCLASS_LOOP (isdigit);
3537 else if (strcmp (name, "print") == 0)
3538 BUILD_CHARCLASS_LOOP (isprint);
3539 else if (strcmp (name, "upper") == 0)
3540 BUILD_CHARCLASS_LOOP (isupper);
3541 else if (strcmp (name, "blank") == 0)
3542 BUILD_CHARCLASS_LOOP (isblank);
3543 else if (strcmp (name, "graph") == 0)
3544 BUILD_CHARCLASS_LOOP (isgraph);
3545 else if (strcmp (name, "punct") == 0)
3546 BUILD_CHARCLASS_LOOP (ispunct);
3547 else if (strcmp (name, "xdigit") == 0)
3548 BUILD_CHARCLASS_LOOP (isxdigit);
3556 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3557 const unsigned char *class_name,
3558 const unsigned char *extra, bool non_match,
3561 re_bitset_ptr_t sbcset;
3562 #ifdef RE_ENABLE_I18N
3563 re_charset_t *mbcset;
3565 #endif /* not RE_ENABLE_I18N */
3567 re_token_t br_token;
3570 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3571 #ifdef RE_ENABLE_I18N
3572 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3573 #endif /* RE_ENABLE_I18N */
3575 #ifdef RE_ENABLE_I18N
3576 if (BE (sbcset == NULL || mbcset == NULL, 0))
3577 #else /* not RE_ENABLE_I18N */
3578 if (BE (sbcset == NULL, 0))
3579 #endif /* not RE_ENABLE_I18N */
3587 #ifdef RE_ENABLE_I18N
3588 mbcset->non_match = 1;
3589 #endif /* not RE_ENABLE_I18N */
3592 /* We don't care the syntax in this case. */
3593 ret = build_charclass (trans, sbcset,
3594 #ifdef RE_ENABLE_I18N
3596 #endif /* RE_ENABLE_I18N */
3599 if (BE (ret != REG_NOERROR, 0))
3602 #ifdef RE_ENABLE_I18N
3603 free_charset (mbcset);
3604 #endif /* RE_ENABLE_I18N */
3608 /* \w match '_' also. */
3609 for (; *extra; extra++)
3610 bitset_set (sbcset, *extra);
3612 /* If it is non-matching list. */
3614 bitset_not (sbcset);
3616 #ifdef RE_ENABLE_I18N
3617 /* Ensure only single byte characters are set. */
3618 if (dfa->mb_cur_max > 1)
3619 bitset_mask (sbcset, dfa->sb_char);
3622 /* Build a tree for simple bracket. */
3623 br_token.type = SIMPLE_BRACKET;
3624 br_token.opr.sbcset = sbcset;
3625 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3626 if (BE (tree == NULL, 0))
3627 goto build_word_op_espace;
3629 #ifdef RE_ENABLE_I18N
3630 if (dfa->mb_cur_max > 1)
3632 bin_tree_t *mbc_tree;
3633 /* Build a tree for complex bracket. */
3634 br_token.type = COMPLEX_BRACKET;
3635 br_token.opr.mbcset = mbcset;
3636 dfa->has_mb_node = 1;
3637 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3638 if (BE (mbc_tree == NULL, 0))
3639 goto build_word_op_espace;
3640 /* Then join them by ALT node. */
3641 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3642 if (BE (mbc_tree != NULL, 1))
3647 free_charset (mbcset);
3650 #else /* not RE_ENABLE_I18N */
3652 #endif /* not RE_ENABLE_I18N */
3654 build_word_op_espace:
3656 #ifdef RE_ENABLE_I18N
3657 free_charset (mbcset);
3658 #endif /* RE_ENABLE_I18N */
3663 /* This is intended for the expressions like "a{1,3}".
3664 Fetch a number from `input', and return the number.
3665 Return REG_MISSING if the number field is empty like "{,1}".
3666 Return REG_ERROR if an error occurred. */
3669 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3671 Idx num = REG_MISSING;
3675 fetch_token (token, input, syntax);
3677 if (BE (token->type == END_OF_RE, 0))
3679 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3681 num = ((token->type != CHARACTER || c < '0' || '9' < c
3682 || num == REG_ERROR)
3684 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3685 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3690 #ifdef RE_ENABLE_I18N
3692 free_charset (re_charset_t *cset)
3694 re_free (cset->mbchars);
3696 re_free (cset->coll_syms);
3697 re_free (cset->equiv_classes);
3698 re_free (cset->range_starts);
3699 re_free (cset->range_ends);
3701 re_free (cset->char_classes);
3704 #endif /* RE_ENABLE_I18N */
3706 /* Functions for binary tree operation. */
3708 /* Create a tree node. */
3711 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3712 re_token_type_t type)
3716 return create_token_tree (dfa, left, right, &t);
3720 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3721 const re_token_t *token)
3724 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3726 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3728 if (storage == NULL)
3730 storage->next = dfa->str_tree_storage;
3731 dfa->str_tree_storage = storage;
3732 dfa->str_tree_storage_idx = 0;
3734 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3736 tree->parent = NULL;
3738 tree->right = right;
3739 tree->token = *token;
3740 tree->token.duplicated = 0;
3741 tree->token.opt_subexp = 0;
3744 tree->node_idx = REG_MISSING;
3747 left->parent = tree;
3749 right->parent = tree;
3753 /* Mark the tree SRC as an optional subexpression.
3754 To be called from preorder or postorder. */
3756 static reg_errcode_t
3757 mark_opt_subexp (void *extra, bin_tree_t *node)
3759 Idx idx = (Idx) (long) extra;
3760 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3761 node->token.opt_subexp = 1;
3766 /* Free the allocated memory inside NODE. */
3769 free_token (re_token_t *node)
3771 #ifdef RE_ENABLE_I18N
3772 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3773 free_charset (node->opr.mbcset);
3775 #endif /* RE_ENABLE_I18N */
3776 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3777 re_free (node->opr.sbcset);
3780 /* Worker function for tree walking. Free the allocated memory inside NODE
3781 and its children. */
3783 static reg_errcode_t
3784 free_tree (void *extra, bin_tree_t *node)
3786 free_token (&node->token);
3791 /* Duplicate the node SRC, and return new node. This is a preorder
3792 visit similar to the one implemented by the generic visitor, but
3793 we need more infrastructure to maintain two parallel trees --- so,
3794 it's easier to duplicate. */
3797 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3799 const bin_tree_t *node;
3800 bin_tree_t *dup_root;
3801 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3803 for (node = root; ; )
3805 /* Create a new tree and link it back to the current parent. */
3806 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3809 (*p_new)->parent = dup_node;
3810 (*p_new)->token.duplicated = 1;
3813 /* Go to the left node, or up and to the right. */
3817 p_new = &dup_node->left;
3821 const bin_tree_t *prev = NULL;
3822 while (node->right == prev || node->right == NULL)
3825 node = node->parent;
3826 dup_node = dup_node->parent;
3831 p_new = &dup_node->right;