git.gag.com Git - debian/tar/blob - gnu/regcomp.c

   1 /* -*- buffer-read-only: t -*- vi: set ro: */
   2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
   3 /* Extended regular expression matching and search library.
   4    Copyright (C) 2002-2011 Free Software Foundation, Inc.
   5    This file is part of the GNU C Library.
   6    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   7
   8    This program is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 3, or (at your option)
  11    any later version.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License along
  19    with this program; if not, write to the Free Software Foundation,
  20    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  21
  22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  23                                           size_t length, reg_syntax_t syntax);
  24 static void re_compile_fastmap_iter (regex_t *bufp,
  25                                      const re_dfastate_t *init_state,
  26                                      char *fastmap);
  27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t preorder (bin_tree_t *root,
  38                                reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                void *extra);
  40 static reg_errcode_t postorder (bin_tree_t *root,
  41                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  42                                 void *extra);
  43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  46                                  bin_tree_t *node);
  47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  50 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  51 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  52                                    unsigned int constraint);
  53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  55                                          Idx node, bool root);
  56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  57 static Idx fetch_number (re_string_t *input, re_token_t *token,
  58                          reg_syntax_t syntax);
  59 static int peek_token (re_token_t *token, re_string_t *input,
  60                         reg_syntax_t syntax) internal_function;
  61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  62                           reg_syntax_t syntax, reg_errcode_t *err);
  63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  64                                   re_token_t *token, reg_syntax_t syntax,
  65                                   Idx nest, reg_errcode_t *err);
  66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  67                                  re_token_t *token, reg_syntax_t syntax,
  68                                  Idx nest, reg_errcode_t *err);
  69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  70                                      re_token_t *token, reg_syntax_t syntax,
  71                                      Idx nest, reg_errcode_t *err);
  72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  73                                   re_token_t *token, reg_syntax_t syntax,
  74                                   Idx nest, reg_errcode_t *err);
  75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  76                                  re_dfa_t *dfa, re_token_t *token,
  77                                  reg_syntax_t syntax, reg_errcode_t *err);
  78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  79                                       re_token_t *token, reg_syntax_t syntax,
  80                                       reg_errcode_t *err);
  81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  82                                             re_string_t *regexp,
  83                                             re_token_t *token, int token_len,
  84                                             re_dfa_t *dfa,
  85                                             reg_syntax_t syntax,
  86                                             bool accept_hyphen);
  87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  88                                           re_string_t *regexp,
  89                                           re_token_t *token);
  90 #ifdef RE_ENABLE_I18N
  91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  92                                         re_charset_t *mbcset,
  93                                         Idx *equiv_class_alloc,
  94                                         const unsigned char *name);
  95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  96                                       bitset_t sbcset,
  97                                       re_charset_t *mbcset,
  98                                       Idx *char_class_alloc,
  99                                       const unsigned char *class_name,
 100                                       reg_syntax_t syntax);
 101 #else  /* not RE_ENABLE_I18N */
 102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 103                                         const unsigned char *name);
 104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 105                                       bitset_t sbcset,
 106                                       const unsigned char *class_name,
 107                                       reg_syntax_t syntax);
 108 #endif /* not RE_ENABLE_I18N */
 109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 110                                        RE_TRANSLATE_TYPE trans,
 111                                        const unsigned char *class_name,
 112                                        const unsigned char *extra,
 113                                        bool non_match, reg_errcode_t *err);
 114 static bin_tree_t *create_tree (re_dfa_t *dfa,
 115                                 bin_tree_t *left, bin_tree_t *right,
 116                                 re_token_type_t type);
 117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 118                                       bin_tree_t *left, bin_tree_t *right,
 119                                       const re_token_t *token);
 120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 121 static void free_token (re_token_t *node);
 122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 124 \f
 125 /* This table gives an error message for each of the error codes listed
 126    in regex.h.  Obviously the order here has to be same as there.
 127    POSIX doesn't require that we do anything for REG_NOERROR,
 128    but why not be nice?  */
 129
 130 static const char __re_error_msgid[] =
 131   {
 132 #define REG_NOERROR_IDX 0
 133     gettext_noop ("Success")    /* REG_NOERROR */
 134     "\0"
 135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 136     gettext_noop ("No match")   /* REG_NOMATCH */
 137     "\0"
 138 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 139     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 140     "\0"
 141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 142     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 143     "\0"
 144 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 145     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 146     "\0"
 147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 148     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 149     "\0"
 150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 151     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 152     "\0"
 153 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 154     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 155     "\0"
 156 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 157     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 158     "\0"
 159 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 160     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 161     "\0"
 162 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 163     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 164     "\0"
 165 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 166     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 167     "\0"
 168 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 169     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 170     "\0"
 171 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 172     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 173     "\0"
 174 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 175     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 176     "\0"
 177 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 178     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 179     "\0"
 180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 181     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 182   };
 183
 184 static const size_t __re_error_msgid_idx[] =
 185   {
 186     REG_NOERROR_IDX,
 187     REG_NOMATCH_IDX,
 188     REG_BADPAT_IDX,
 189     REG_ECOLLATE_IDX,
 190     REG_ECTYPE_IDX,
 191     REG_EESCAPE_IDX,
 192     REG_ESUBREG_IDX,
 193     REG_EBRACK_IDX,
 194     REG_EPAREN_IDX,
 195     REG_EBRACE_IDX,
 196     REG_BADBR_IDX,
 197     REG_ERANGE_IDX,
 198     REG_ESPACE_IDX,
 199     REG_BADRPT_IDX,
 200     REG_EEND_IDX,
 201     REG_ESIZE_IDX,
 202     REG_ERPAREN_IDX
 203   };
 204 \f
 205 /* Entry points for GNU code.  */
 206
 207 /* re_compile_pattern is the GNU regular expression compiler: it
 208    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 209    Returns 0 if the pattern was valid, otherwise an error string.
 210
 211    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 212    are set in BUFP on entry.  */
 213
 214 #ifdef _LIBC
 215 const char *
 216 re_compile_pattern (pattern, length, bufp)
 217     const char *pattern;
 218     size_t length;
 219     struct re_pattern_buffer *bufp;
 220 #else /* size_t might promote */
 221 const char *
 222 re_compile_pattern (const char *pattern, size_t length,
 223                     struct re_pattern_buffer *bufp)
 224 #endif
 225 {
 226   reg_errcode_t ret;
 227
 228   /* And GNU code determines whether or not to get register information
 229      by passing null for the REGS argument to re_match, etc., not by
 230      setting no_sub, unless RE_NO_SUB is set.  */
 231   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 232
 233   /* Match anchors at newline.  */
 234   bufp->newline_anchor = 1;
 235
 236   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 237
 238   if (!ret)
 239     return NULL;
 240   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 241 }
 242 #ifdef _LIBC
 243 weak_alias (__re_compile_pattern, re_compile_pattern)
 244 #endif
 245
 246 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 247    also be assigned to arbitrarily: each pattern buffer stores its own
 248    syntax, so it can be changed between regex compilations.  */
 249 /* This has no initializer because initialized variables in Emacs
 250    become read-only after dumping.  */
 251 reg_syntax_t re_syntax_options;
 252
 253
 254 /* Specify the precise syntax of regexps for compilation.  This provides
 255    for compatibility for various utilities which historically have
 256    different, incompatible syntaxes.
 257
 258    The argument SYNTAX is a bit mask comprised of the various bits
 259    defined in regex.h.  We return the old syntax.  */
 260
 261 reg_syntax_t
 262 re_set_syntax (syntax)
 263     reg_syntax_t syntax;
 264 {
 265   reg_syntax_t ret = re_syntax_options;
 266
 267   re_syntax_options = syntax;
 268   return ret;
 269 }
 270 #ifdef _LIBC
 271 weak_alias (__re_set_syntax, re_set_syntax)
 272 #endif
 273
 274 int
 275 re_compile_fastmap (bufp)
 276     struct re_pattern_buffer *bufp;
 277 {
 278   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 279   char *fastmap = bufp->fastmap;
 280
 281   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 282   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 283   if (dfa->init_state != dfa->init_state_word)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 285   if (dfa->init_state != dfa->init_state_nl)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 287   if (dfa->init_state != dfa->init_state_begbuf)
 288     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 289   bufp->fastmap_accurate = 1;
 290   return 0;
 291 }
 292 #ifdef _LIBC
 293 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 294 #endif
 295
 296 static inline void
 297 __attribute ((always_inline))
 298 re_set_fastmap (char *fastmap, bool icase, int ch)
 299 {
 300   fastmap[ch] = 1;
 301   if (icase)
 302     fastmap[tolower (ch)] = 1;
 303 }
 304
 305 /* Helper function for re_compile_fastmap.
 306    Compile fastmap for the initial_state INIT_STATE.  */
 307
 308 static void
 309 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 310                          char *fastmap)
 311 {
 312   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 313   Idx node_cnt;
 314   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 315   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 316     {
 317       Idx node = init_state->nodes.elems[node_cnt];
 318       re_token_type_t type = dfa->nodes[node].type;
 319
 320       if (type == CHARACTER)
 321         {
 322           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 323 #ifdef RE_ENABLE_I18N
 324           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 325             {
 326               unsigned char buf[MB_LEN_MAX];
 327               unsigned char *p;
 328               wchar_t wc;
 329               mbstate_t state;
 330
 331               p = buf;
 332               *p++ = dfa->nodes[node].opr.c;
 333               while (++node < dfa->nodes_len
 334                      && dfa->nodes[node].type == CHARACTER
 335                      && dfa->nodes[node].mb_partial)
 336                 *p++ = dfa->nodes[node].opr.c;
 337               memset (&state, '\0', sizeof (state));
 338               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 339                              &state) == p - buf
 340                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 341                       != (size_t) -1))
 342                 re_set_fastmap (fastmap, false, buf[0]);
 343             }
 344 #endif
 345         }
 346       else if (type == SIMPLE_BRACKET)
 347         {
 348           int i, ch;
 349           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 350             {
 351               int j;
 352               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 353               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 354                 if (w & ((bitset_word_t) 1 << j))
 355                   re_set_fastmap (fastmap, icase, ch);
 356             }
 357         }
 358 #ifdef RE_ENABLE_I18N
 359       else if (type == COMPLEX_BRACKET)
 360         {
 361           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 362           Idx i;
 363
 364 # ifdef _LIBC
 365           /* See if we have to try all bytes which start multiple collation
 366              elements.
 367              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 368                   collation element, and don't catch 'b' since 'b' is
 369                   the only collation element which starts from 'b' (and
 370                   it is caught by SIMPLE_BRACKET).  */
 371               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 372                   && (cset->ncoll_syms || cset->nranges))
 373                 {
 374                   const int32_t *table = (const int32_t *)
 375                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 376                   for (i = 0; i < SBC_MAX; ++i)
 377                     if (table[i] < 0)
 378                       re_set_fastmap (fastmap, icase, i);
 379                 }
 380 # endif /* _LIBC */
 381
 382           /* See if we have to start the match at all multibyte characters,
 383              i.e. where we would not find an invalid sequence.  This only
 384              applies to multibyte character sets; for single byte character
 385              sets, the SIMPLE_BRACKET again suffices.  */
 386           if (dfa->mb_cur_max > 1
 387               && (cset->nchar_classes || cset->non_match || cset->nranges
 388 # ifdef _LIBC
 389                   || cset->nequiv_classes
 390 # endif /* _LIBC */
 391                  ))
 392             {
 393               unsigned char c = 0;
 394               do
 395                 {
 396                   mbstate_t mbs;
 397                   memset (&mbs, 0, sizeof (mbs));
 398                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 399                     re_set_fastmap (fastmap, false, (int) c);
 400                 }
 401               while (++c != 0);
 402             }
 403
 404           else
 405             {
 406               /* ... Else catch all bytes which can start the mbchars.  */
 407               for (i = 0; i < cset->nmbchars; ++i)
 408                 {
 409                   char buf[256];
 410                   mbstate_t state;
 411                   memset (&state, '\0', sizeof (state));
 412                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 413                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 414                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 415                     {
 416                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 417                           != (size_t) -1)
 418                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 419                     }
 420                 }
 421             }
 422         }
 423 #endif /* RE_ENABLE_I18N */
 424       else if (type == OP_PERIOD
 425 #ifdef RE_ENABLE_I18N
 426                || type == OP_UTF8_PERIOD
 427 #endif /* RE_ENABLE_I18N */
 428                || type == END_OF_RE)
 429         {
 430           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 431           if (type == END_OF_RE)
 432             bufp->can_be_null = 1;
 433           return;
 434         }
 435     }
 436 }
 437 \f
 438 /* Entry point for POSIX code.  */
 439 /* regcomp takes a regular expression as a string and compiles it.
 440
 441    PREG is a regex_t *.  We do not expect any fields to be initialized,
 442    since POSIX says we shouldn't.  Thus, we set
 443
 444      `buffer' to the compiled pattern;
 445      `used' to the length of the compiled pattern;
 446      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 447        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 448        RE_SYNTAX_POSIX_BASIC;
 449      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 450      `fastmap' to an allocated space for the fastmap;
 451      `fastmap_accurate' to zero;
 452      `re_nsub' to the number of subexpressions in PATTERN.
 453
 454    PATTERN is the address of the pattern string.
 455
 456    CFLAGS is a series of bits which affect compilation.
 457
 458      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 459      use POSIX basic syntax.
 460
 461      If REG_NEWLINE is set, then . and [^...] don't match newline.
 462      Also, regexec will try a match beginning after every newline.
 463
 464      If REG_ICASE is set, then we considers upper- and lowercase
 465      versions of letters to be equivalent when matching.
 466
 467      If REG_NOSUB is set, then when PREG is passed to regexec, that
 468      routine will report only success or failure, and nothing about the
 469      registers.
 470
 471    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 472    the return codes and their meanings.)  */
 473
 474 int
 475 regcomp (preg, pattern, cflags)
 476     regex_t *_Restrict_ preg;
 477     const char *_Restrict_ pattern;
 478     int cflags;
 479 {
 480   reg_errcode_t ret;
 481   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 482                          : RE_SYNTAX_POSIX_BASIC);
 483
 484   preg->buffer = NULL;
 485   preg->allocated = 0;
 486   preg->used = 0;
 487
 488   /* Try to allocate space for the fastmap.  */
 489   preg->fastmap = re_malloc (char, SBC_MAX);
 490   if (BE (preg->fastmap == NULL, 0))
 491     return REG_ESPACE;
 492
 493   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 494
 495   /* If REG_NEWLINE is set, newlines are treated differently.  */
 496   if (cflags & REG_NEWLINE)
 497     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 498       syntax &= ~RE_DOT_NEWLINE;
 499       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 500       /* It also changes the matching behavior.  */
 501       preg->newline_anchor = 1;
 502     }
 503   else
 504     preg->newline_anchor = 0;
 505   preg->no_sub = !!(cflags & REG_NOSUB);
 506   preg->translate = NULL;
 507
 508   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 509
 510   /* POSIX doesn't distinguish between an unmatched open-group and an
 511      unmatched close-group: both are REG_EPAREN.  */
 512   if (ret == REG_ERPAREN)
 513     ret = REG_EPAREN;
 514
 515   /* We have already checked preg->fastmap != NULL.  */
 516   if (BE (ret == REG_NOERROR, 1))
 517     /* Compute the fastmap now, since regexec cannot modify the pattern
 518        buffer.  This function never fails in this implementation.  */
 519     (void) re_compile_fastmap (preg);
 520   else
 521     {
 522       /* Some error occurred while compiling the expression.  */
 523       re_free (preg->fastmap);
 524       preg->fastmap = NULL;
 525     }
 526
 527   return (int) ret;
 528 }
 529 #ifdef _LIBC
 530 weak_alias (__regcomp, regcomp)
 531 #endif
 532
 533 /* Returns a message corresponding to an error code, ERRCODE, returned
 534    from either regcomp or regexec.   We don't use PREG here.  */
 535
 536 #ifdef _LIBC
 537 size_t
 538 regerror (errcode, preg, errbuf, errbuf_size)
 539     int errcode;
 540     const regex_t *_Restrict_ preg;
 541     char *_Restrict_ errbuf;
 542     size_t errbuf_size;
 543 #else /* size_t might promote */
 544 size_t
 545 regerror (int errcode, const regex_t *_Restrict_ preg,
 546           char *_Restrict_ errbuf, size_t errbuf_size)
 547 #endif
 548 {
 549   const char *msg;
 550   size_t msg_size;
 551
 552   if (BE (errcode < 0
 553           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 554                                / sizeof (__re_error_msgid_idx[0])), 0))
 555     /* Only error codes returned by the rest of the code should be passed
 556        to this routine.  If we are given anything else, or if other regex
 557        code generates an invalid error code, then the program has a bug.
 558        Dump core so we can fix it.  */
 559     abort ();
 560
 561   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 562
 563   msg_size = strlen (msg) + 1; /* Includes the null.  */
 564
 565   if (BE (errbuf_size != 0, 1))
 566     {
 567       size_t cpy_size = msg_size;
 568       if (BE (msg_size > errbuf_size, 0))
 569         {
 570           cpy_size = errbuf_size - 1;
 571           errbuf[cpy_size] = '\0';
 572         }
 573       memcpy (errbuf, msg, cpy_size);
 574     }
 575
 576   return msg_size;
 577 }
 578 #ifdef _LIBC
 579 weak_alias (__regerror, regerror)
 580 #endif
 581
 582
 583 #ifdef RE_ENABLE_I18N
 584 /* This static array is used for the map to single-byte characters when
 585    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 586    it the same all the time.  UTF-8 is the preferred encoding so this is
 587    a worthwhile optimization.  */
 588 static const bitset_t utf8_sb_map =
 589 {
 590   /* Set the first 128 bits.  */
 591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 592 #  error "bitset_word_t is narrower than 32 bits"
 593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 594   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 596   BITSET_WORD_MAX, BITSET_WORD_MAX,
 597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 598   BITSET_WORD_MAX,
 599 # endif
 600   (BITSET_WORD_MAX
 601    >> (SBC_MAX % BITSET_WORD_BITS == 0
 602        ? 0
 603        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 604 };
 605 #endif
 606
 607
 608 static void
 609 free_dfa_content (re_dfa_t *dfa)
 610 {
 611   Idx i, j;
 612
 613   if (dfa->nodes)
 614     for (i = 0; i < dfa->nodes_len; ++i)
 615       free_token (dfa->nodes + i);
 616   re_free (dfa->nexts);
 617   for (i = 0; i < dfa->nodes_len; ++i)
 618     {
 619       if (dfa->eclosures != NULL)
 620         re_node_set_free (dfa->eclosures + i);
 621       if (dfa->inveclosures != NULL)
 622         re_node_set_free (dfa->inveclosures + i);
 623       if (dfa->edests != NULL)
 624         re_node_set_free (dfa->edests + i);
 625     }
 626   re_free (dfa->edests);
 627   re_free (dfa->eclosures);
 628   re_free (dfa->inveclosures);
 629   re_free (dfa->nodes);
 630
 631   if (dfa->state_table)
 632     for (i = 0; i <= dfa->state_hash_mask; ++i)
 633       {
 634         struct re_state_table_entry *entry = dfa->state_table + i;
 635         for (j = 0; j < entry->num; ++j)
 636           {
 637             re_dfastate_t *state = entry->array[j];
 638             free_state (state);
 639           }
 640         re_free (entry->array);
 641       }
 642   re_free (dfa->state_table);
 643 #ifdef RE_ENABLE_I18N
 644   if (dfa->sb_char != utf8_sb_map)
 645     re_free (dfa->sb_char);
 646 #endif
 647   re_free (dfa->subexp_map);
 648 #ifdef DEBUG
 649   re_free (dfa->re_str);
 650 #endif
 651
 652   re_free (dfa);
 653 }
 654
 655
 656 /* Free dynamically allocated space used by PREG.  */
 657
 658 void
 659 regfree (preg)
 660     regex_t *preg;
 661 {
 662   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 663   if (BE (dfa != NULL, 1))
 664     free_dfa_content (dfa);
 665   preg->buffer = NULL;
 666   preg->allocated = 0;
 667
 668   re_free (preg->fastmap);
 669   preg->fastmap = NULL;
 670
 671   re_free (preg->translate);
 672   preg->translate = NULL;
 673 }
 674 #ifdef _LIBC
 675 weak_alias (__regfree, regfree)
 676 #endif
 677 \f
 678 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 679    them unless specifically requested.  */
 680
 681 #if defined _REGEX_RE_COMP || defined _LIBC
 682
 683 /* BSD has one and only one pattern buffer.  */
 684 static struct re_pattern_buffer re_comp_buf;
 685
 686 char *
 687 # ifdef _LIBC
 688 /* Make these definitions weak in libc, so POSIX programs can redefine
 689    these names if they don't use our functions, and still use
 690    regcomp/regexec above without link errors.  */
 691 weak_function
 692 # endif
 693 re_comp (s)
 694      const char *s;
 695 {
 696   reg_errcode_t ret;
 697   char *fastmap;
 698
 699   if (!s)
 700     {
 701       if (!re_comp_buf.buffer)
 702         return gettext ("No previous regular expression");
 703       return 0;
 704     }
 705
 706   if (re_comp_buf.buffer)
 707     {
 708       fastmap = re_comp_buf.fastmap;
 709       re_comp_buf.fastmap = NULL;
 710       __regfree (&re_comp_buf);
 711       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 712       re_comp_buf.fastmap = fastmap;
 713     }
 714
 715   if (re_comp_buf.fastmap == NULL)
 716     {
 717       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 718       if (re_comp_buf.fastmap == NULL)
 719         return (char *) gettext (__re_error_msgid
 720                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 721     }
 722
 723   /* Since `re_exec' always passes NULL for the `regs' argument, we
 724      don't need to initialize the pattern buffer fields which affect it.  */
 725
 726   /* Match anchors at newlines.  */
 727   re_comp_buf.newline_anchor = 1;
 728
 729   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 730
 731   if (!ret)
 732     return NULL;
 733
 734   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 735   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 736 }
 737
 738 #ifdef _LIBC
 739 libc_freeres_fn (free_mem)
 740 {
 741   __regfree (&re_comp_buf);
 742 }
 743 #endif
 744
 745 #endif /* _REGEX_RE_COMP */
 746 \f
 747 /* Internal entry point.
 748    Compile the regular expression PATTERN, whose length is LENGTH.
 749    SYNTAX indicate regular expression's syntax.  */
 750
 751 static reg_errcode_t
 752 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 753                      reg_syntax_t syntax)
 754 {
 755   reg_errcode_t err = REG_NOERROR;
 756   re_dfa_t *dfa;
 757   re_string_t regexp;
 758
 759   /* Initialize the pattern buffer.  */
 760   preg->fastmap_accurate = 0;
 761   preg->syntax = syntax;
 762   preg->not_bol = preg->not_eol = 0;
 763   preg->used = 0;
 764   preg->re_nsub = 0;
 765   preg->can_be_null = 0;
 766   preg->regs_allocated = REGS_UNALLOCATED;
 767
 768   /* Initialize the dfa.  */
 769   dfa = (re_dfa_t *) preg->buffer;
 770   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 771     {
 772       /* If zero allocated, but buffer is non-null, try to realloc
 773          enough space.  This loses if buffer's address is bogus, but
 774          that is the user's responsibility.  If ->buffer is NULL this
 775          is a simple allocation.  */
 776       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 777       if (dfa == NULL)
 778         return REG_ESPACE;
 779       preg->allocated = sizeof (re_dfa_t);
 780       preg->buffer = (unsigned char *) dfa;
 781     }
 782   preg->used = sizeof (re_dfa_t);
 783
 784   err = init_dfa (dfa, length);
 785   if (BE (err != REG_NOERROR, 0))
 786     {
 787       free_dfa_content (dfa);
 788       preg->buffer = NULL;
 789       preg->allocated = 0;
 790       return err;
 791     }
 792 #ifdef DEBUG
 793   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 794   dfa->re_str = re_malloc (char, length + 1);
 795   strncpy (dfa->re_str, pattern, length + 1);
 796 #endif
 797
 798   __libc_lock_init (dfa->lock);
 799
 800   err = re_string_construct (&regexp, pattern, length, preg->translate,
 801                              (syntax & RE_ICASE) != 0, dfa);
 802   if (BE (err != REG_NOERROR, 0))
 803     {
 804     re_compile_internal_free_return:
 805       free_workarea_compile (preg);
 806       re_string_destruct (&regexp);
 807       free_dfa_content (dfa);
 808       preg->buffer = NULL;
 809       preg->allocated = 0;
 810       return err;
 811     }
 812
 813   /* Parse the regular expression, and build a structure tree.  */
 814   preg->re_nsub = 0;
 815   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 816   if (BE (dfa->str_tree == NULL, 0))
 817     goto re_compile_internal_free_return;
 818
 819   /* Analyze the tree and create the nfa.  */
 820   err = analyze (preg);
 821   if (BE (err != REG_NOERROR, 0))
 822     goto re_compile_internal_free_return;
 823
 824 #ifdef RE_ENABLE_I18N
 825   /* If possible, do searching in single byte encoding to speed things up.  */
 826   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 827     optimize_utf8 (dfa);
 828 #endif
 829
 830   /* Then create the initial state of the dfa.  */
 831   err = create_initial_state (dfa);
 832
 833   /* Release work areas.  */
 834   free_workarea_compile (preg);
 835   re_string_destruct (&regexp);
 836
 837   if (BE (err != REG_NOERROR, 0))
 838     {
 839       free_dfa_content (dfa);
 840       preg->buffer = NULL;
 841       preg->allocated = 0;
 842     }
 843
 844   return err;
 845 }
 846
 847 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 848    as the initial length of some arrays.  */
 849
 850 static reg_errcode_t
 851 init_dfa (re_dfa_t *dfa, size_t pat_len)
 852 {
 853   __re_size_t table_size;
 854 #ifndef _LIBC
 855   char *codeset_name;
 856 #endif
 857 #ifdef RE_ENABLE_I18N
 858   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 859 #else
 860   size_t max_i18n_object_size = 0;
 861 #endif
 862   size_t max_object_size =
 863     MAX (sizeof (struct re_state_table_entry),
 864          MAX (sizeof (re_token_t),
 865               MAX (sizeof (re_node_set),
 866                    MAX (sizeof (regmatch_t),
 867                         max_i18n_object_size))));
 868
 869   memset (dfa, '\0', sizeof (re_dfa_t));
 870
 871   /* Force allocation of str_tree_storage the first time.  */
 872   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 873
 874   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 875      calculation below, and for similar doubling calculations
 876      elsewhere.  And it's <= rather than <, because some of the
 877      doubling calculations add 1 afterwards.  */
 878   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 879     return REG_ESPACE;
 880
 881   dfa->nodes_alloc = pat_len + 1;
 882   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 883
 884   /*  table_size = 2 ^ ceil(log pat_len) */
 885   for (table_size = 1; ; table_size <<= 1)
 886     if (table_size > pat_len)
 887       break;
 888
 889   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 890   dfa->state_hash_mask = table_size - 1;
 891
 892   dfa->mb_cur_max = MB_CUR_MAX;
 893 #ifdef _LIBC
 894   if (dfa->mb_cur_max == 6
 895       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 896     dfa->is_utf8 = 1;
 897   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 898                        != 0);
 899 #else
 900   codeset_name = nl_langinfo (CODESET);
 901   if (strcasecmp (codeset_name, "UTF-8") == 0
 902       || strcasecmp (codeset_name, "UTF8") == 0)
 903     dfa->is_utf8 = 1;
 904
 905   /* We check exhaustively in the loop below if this charset is a
 906      superset of ASCII.  */
 907   dfa->map_notascii = 0;
 908 #endif
 909
 910 #ifdef RE_ENABLE_I18N
 911   if (dfa->mb_cur_max > 1)
 912     {
 913       if (dfa->is_utf8)
 914         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 915       else
 916         {
 917           int i, j, ch;
 918
 919           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 920           if (BE (dfa->sb_char == NULL, 0))
 921             return REG_ESPACE;
 922
 923           /* Set the bits corresponding to single byte chars.  */
 924           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 925             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 926               {
 927                 wint_t wch = __btowc (ch);
 928                 if (wch != WEOF)
 929                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 930 # ifndef _LIBC
 931                 if (isascii (ch) && wch != ch)
 932                   dfa->map_notascii = 1;
 933 # endif
 934               }
 935         }
 936     }
 937 #endif
 938
 939   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 940     return REG_ESPACE;
 941   return REG_NOERROR;
 942 }
 943
 944 /* Initialize WORD_CHAR table, which indicate which character is
 945    "word".  In this case "word" means that it is the word construction
 946    character used by some operators like "\<", "\>", etc.  */
 947
 948 static void
 949 internal_function
 950 init_word_char (re_dfa_t *dfa)
 951 {
 952   int i, j, ch;
 953   dfa->word_ops_used = 1;
 954   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 955     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 956       if (isalnum (ch) || ch == '_')
 957         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 958 }
 959
 960 /* Free the work area which are only used while compiling.  */
 961
 962 static void
 963 free_workarea_compile (regex_t *preg)
 964 {
 965   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 966   bin_tree_storage_t *storage, *next;
 967   for (storage = dfa->str_tree_storage; storage; storage = next)
 968     {
 969       next = storage->next;
 970       re_free (storage);
 971     }
 972   dfa->str_tree_storage = NULL;
 973   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 974   dfa->str_tree = NULL;
 975   re_free (dfa->org_indices);
 976   dfa->org_indices = NULL;
 977 }
 978
 979 /* Create initial states for all contexts.  */
 980
 981 static reg_errcode_t
 982 create_initial_state (re_dfa_t *dfa)
 983 {
 984   Idx first, i;
 985   reg_errcode_t err;
 986   re_node_set init_nodes;
 987
 988   /* Initial states have the epsilon closure of the node which is
 989      the first node of the regular expression.  */
 990   first = dfa->str_tree->first->node_idx;
 991   dfa->init_node = first;
 992   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 993   if (BE (err != REG_NOERROR, 0))
 994     return err;
 995
 996   /* The back-references which are in initial states can epsilon transit,
 997      since in this case all of the subexpressions can be null.
 998      Then we add epsilon closures of the nodes which are the next nodes of
 999      the back-references.  */
1000   if (dfa->nbackref > 0)
1001     for (i = 0; i < init_nodes.nelem; ++i)
1002       {
1003         Idx node_idx = init_nodes.elems[i];
1004         re_token_type_t type = dfa->nodes[node_idx].type;
1005
1006         Idx clexp_idx;
1007         if (type != OP_BACK_REF)
1008           continue;
1009         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1010           {
1011             re_token_t *clexp_node;
1012             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1013             if (clexp_node->type == OP_CLOSE_SUBEXP
1014                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1015               break;
1016           }
1017         if (clexp_idx == init_nodes.nelem)
1018           continue;
1019
1020         if (type == OP_BACK_REF)
1021           {
1022             Idx dest_idx = dfa->edests[node_idx].elems[0];
1023             if (!re_node_set_contains (&init_nodes, dest_idx))
1024               {
1025                 reg_errcode_t merge_err
1026                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1027                 if (merge_err != REG_NOERROR)
1028                   return merge_err;
1029                 i = 0;
1030               }
1031           }
1032       }
1033
1034   /* It must be the first time to invoke acquire_state.  */
1035   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1036   /* We don't check ERR here, since the initial state must not be NULL.  */
1037   if (BE (dfa->init_state == NULL, 0))
1038     return err;
1039   if (dfa->init_state->has_constraint)
1040     {
1041       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1042                                                        CONTEXT_WORD);
1043       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1044                                                      CONTEXT_NEWLINE);
1045       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1046                                                          &init_nodes,
1047                                                          CONTEXT_NEWLINE
1048                                                          | CONTEXT_BEGBUF);
1049       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1050               || dfa->init_state_begbuf == NULL, 0))
1051         return err;
1052     }
1053   else
1054     dfa->init_state_word = dfa->init_state_nl
1055       = dfa->init_state_begbuf = dfa->init_state;
1056
1057   re_node_set_free (&init_nodes);
1058   return REG_NOERROR;
1059 }
1060 \f
#ifdef RE_ENABLE_I18N
/* If searching can be done in a single byte encoding instead of UTF-8,
   switch DFA over to it: set dfa->mb_cur_max to 1, clear is_utf8 and
   adjust the DFA nodes that need it.  The function returns without
   changing anything when it meets a node it cannot handle this way.  */

static void
optimize_utf8 (re_dfa_t *dfa)
{
  Idx node;
  int i;
  bool mb_chars = false;
  bool has_period = false;

  /* First pass: check that every node is compatible, and note whether
     any rewriting will be needed.  */
  for (node = 0; node < dfa->nodes_len; ++node)
    {
      const re_token_t *tok = dfa->nodes + node;

      switch (tok->type)
        {
        case CHARACTER:
          if (tok->opr.c >= ASCII_CHARS)
            mb_chars = true;
          break;

        case ANCHOR:
          /* Word anchors etc. cannot be handled.  It's okay to test
             opr.ctx_type since constraints (for all DFA nodes) are
             created by ORing one or more opr.ctx_type values.  */
          if (tok->opr.ctx_type != LINE_FIRST
              && tok->opr.ctx_type != LINE_LAST
              && tok->opr.ctx_type != BUF_FIRST
              && tok->opr.ctx_type != BUF_LAST)
            return;
          break;

        case OP_PERIOD:
          has_period = true;
          break;

        case OP_BACK_REF:
        case OP_ALT:
        case END_OF_RE:
        case OP_DUP_ASTERISK:
        case OP_OPEN_SUBEXP:
        case OP_CLOSE_SUBEXP:
          break;

        case COMPLEX_BRACKET:
          return;

        case SIMPLE_BRACKET:
          /* Just double check: give up if the bracket has any bit set
             at or above ASCII_CHARS.  Only the first word examined may
             straddle the boundary, hence the one-time shift.  */
          {
            int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
                          ? 0
                          : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
            for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
              {
                if (tok->opr.sbcset[i] >> rshift != 0)
                  return;
                rshift = 0;
              }
          }
          break;

        default:
          abort ();
        }
    }

  /* Second pass: rewrite the nodes noted above.  */
  if (mb_chars || has_period)
    for (node = 0; node < dfa->nodes_len; ++node)
      {
        re_token_t *tok = dfa->nodes + node;

        switch (tok->type)
          {
          case CHARACTER:
            if (tok->opr.c >= ASCII_CHARS)
              tok->mb_partial = 0;
            break;
          case OP_PERIOD:
            tok->type = OP_UTF8_PERIOD;
            break;
          default:
            break;
          }
      }

  /* The search can be in single byte locale.  */
  dfa->mb_cur_max = 1;
  dfa->is_utf8 = 0;
  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
}
#endif
1142 \f
1143 /* Analyze the structure tree, and calculate "first", "next", "edest",
1144    "eclosure", and "inveclosure".  */
1145
1146 static reg_errcode_t
1147 analyze (regex_t *preg)
1148 {
1149   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1150   reg_errcode_t ret;
1151
1152   /* Allocate arrays.  */
1153   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1154   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1155   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1156   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1157   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1158           || dfa->eclosures == NULL, 0))
1159     return REG_ESPACE;
1160
1161   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1162   if (dfa->subexp_map != NULL)
1163     {
1164       Idx i;
1165       for (i = 0; i < preg->re_nsub; i++)
1166         dfa->subexp_map[i] = i;
1167       preorder (dfa->str_tree, optimize_subexps, dfa);
1168       for (i = 0; i < preg->re_nsub; i++)
1169         if (dfa->subexp_map[i] != i)
1170           break;
1171       if (i == preg->re_nsub)
1172         {
1173           free (dfa->subexp_map);
1174           dfa->subexp_map = NULL;
1175         }
1176     }
1177
1178   ret = postorder (dfa->str_tree, lower_subexps, preg);
1179   if (BE (ret != REG_NOERROR, 0))
1180     return ret;
1181   ret = postorder (dfa->str_tree, calc_first, dfa);
1182   if (BE (ret != REG_NOERROR, 0))
1183     return ret;
1184   preorder (dfa->str_tree, calc_next, dfa);
1185   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1186   if (BE (ret != REG_NOERROR, 0))
1187     return ret;
1188   ret = calc_eclosure (dfa);
1189   if (BE (ret != REG_NOERROR, 0))
1190     return ret;
1191
1192   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1193      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1194   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1195       || dfa->nbackref)
1196     {
1197       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1198       if (BE (dfa->inveclosures == NULL, 0))
1199         return REG_ESPACE;
1200       ret = calc_inveclosure (dfa);
1201     }
1202
1203   return ret;
1204 }
1205
1206 /* Our parse trees are very unbalanced, so we cannot use a stack to
1207    implement parse tree visits.  Instead, we use parent pointers and
1208    some hairy code in these two functions.  */
1209 static reg_errcode_t
1210 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1211            void *extra)
1212 {
1213   bin_tree_t *node, *prev;
1214
1215   for (node = root; ; )
1216     {
1217       /* Descend down the tree, preferably to the left (or to the right
1218          if that's the only child).  */
1219       while (node->left || node->right)
1220         if (node->left)
1221           node = node->left;
1222         else
1223           node = node->right;
1224
1225       do
1226         {
1227           reg_errcode_t err = fn (extra, node);
1228           if (BE (err != REG_NOERROR, 0))
1229             return err;
1230           if (node->parent == NULL)
1231             return REG_NOERROR;
1232           prev = node;
1233           node = node->parent;
1234         }
1235       /* Go up while we have a node that is reached from the right.  */
1236       while (node->right == prev || node->right == NULL);
1237       node = node->right;
1238     }
1239 }
1240
1241 static reg_errcode_t
1242 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1243           void *extra)
1244 {
1245   bin_tree_t *node;
1246
1247   for (node = root; ; )
1248     {
1249       reg_errcode_t err = fn (extra, node);
1250       if (BE (err != REG_NOERROR, 0))
1251         return err;
1252
1253       /* Go to the left node, or up and to the right.  */
1254       if (node->left)
1255         node = node->left;
1256       else
1257         {
1258           bin_tree_t *prev = NULL;
1259           while (node->right == prev || node->right == NULL)
1260             {
1261               prev = node;
1262               node = node->parent;
1263               if (!node)
1264                 return REG_NOERROR;
1265             }
1266           node = node->right;
1267         }
1268     }
1269 }
1270
1271 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1272    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1273    backreferences as well.  Requires a preorder visit.  */
1274 static reg_errcode_t
1275 optimize_subexps (void *extra, bin_tree_t *node)
1276 {
1277   re_dfa_t *dfa = (re_dfa_t *) extra;
1278
1279   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1280     {
1281       int idx = node->token.opr.idx;
1282       node->token.opr.idx = dfa->subexp_map[idx];
1283       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1284     }
1285
1286   else if (node->token.type == SUBEXP
1287            && node->left && node->left->token.type == SUBEXP)
1288     {
1289       Idx other_idx = node->left->token.opr.idx;
1290
1291       node->left = node->left->left;
1292       if (node->left)
1293         node->left->parent = node;
1294
1295       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1296       if (other_idx < BITSET_WORD_BITS)
1297         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1298     }
1299
1300   return REG_NOERROR;
1301 }
1302
1303 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1304    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1305 static reg_errcode_t
1306 lower_subexps (void *extra, bin_tree_t *node)
1307 {
1308   regex_t *preg = (regex_t *) extra;
1309   reg_errcode_t err = REG_NOERROR;
1310
1311   if (node->left && node->left->token.type == SUBEXP)
1312     {
1313       node->left = lower_subexp (&err, preg, node->left);
1314       if (node->left)
1315         node->left->parent = node;
1316     }
1317   if (node->right && node->right->token.type == SUBEXP)
1318     {
1319       node->right = lower_subexp (&err, preg, node->right);
1320       if (node->right)
1321         node->right->parent = node;
1322     }
1323
1324   return err;
1325 }
1326
1327 static bin_tree_t *
1328 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1329 {
1330   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1331   bin_tree_t *body = node->left;
1332   bin_tree_t *op, *cls, *tree1, *tree;
1333
1334   if (preg->no_sub
1335       /* We do not optimize empty subexpressions, because otherwise we may
1336          have bad CONCAT nodes with NULL children.  This is obviously not
1337          very common, so we do not lose much.  An example that triggers
1338          this case is the sed "script" /\(\)/x.  */
1339       && node->left != NULL
1340       && (node->token.opr.idx >= BITSET_WORD_BITS
1341           || !(dfa->used_bkref_map
1342                & ((bitset_word_t) 1 << node->token.opr.idx))))
1343     return node->left;
1344
1345   /* Convert the SUBEXP node to the concatenation of an
1346      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1347   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1348   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1349   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1350   tree = create_tree (dfa, op, tree1, CONCAT);
1351   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1352     {
1353       *err = REG_ESPACE;
1354       return NULL;
1355     }
1356
1357   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1358   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1359   return tree;
1360 }
1361
1362 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1363    nodes.  Requires a postorder visit.  */
1364 static reg_errcode_t
1365 calc_first (void *extra, bin_tree_t *node)
1366 {
1367   re_dfa_t *dfa = (re_dfa_t *) extra;
1368   if (node->token.type == CONCAT)
1369     {
1370       node->first = node->left->first;
1371       node->node_idx = node->left->node_idx;
1372     }
1373   else
1374     {
1375       node->first = node;
1376       node->node_idx = re_dfa_add_node (dfa, node->token);
1377       if (BE (node->node_idx == REG_MISSING, 0))
1378         return REG_ESPACE;
1379       if (node->token.type == ANCHOR)
1380         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1381     }
1382   return REG_NOERROR;
1383 }
1384
1385 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1386 static reg_errcode_t
1387 calc_next (void *extra, bin_tree_t *node)
1388 {
1389   switch (node->token.type)
1390     {
1391     case OP_DUP_ASTERISK:
1392       node->left->next = node;
1393       break;
1394     case CONCAT:
1395       node->left->next = node->right->first;
1396       node->right->next = node->next;
1397       break;
1398     default:
1399       if (node->left)
1400         node->left->next = node->next;
1401       if (node->right)
1402         node->right->next = node->next;
1403       break;
1404     }
1405   return REG_NOERROR;
1406 }
1407
1408 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1409 static reg_errcode_t
1410 link_nfa_nodes (void *extra, bin_tree_t *node)
1411 {
1412   re_dfa_t *dfa = (re_dfa_t *) extra;
1413   Idx idx = node->node_idx;
1414   reg_errcode_t err = REG_NOERROR;
1415
1416   switch (node->token.type)
1417     {
1418     case CONCAT:
1419       break;
1420
1421     case END_OF_RE:
1422       assert (node->next == NULL);
1423       break;
1424
1425     case OP_DUP_ASTERISK:
1426     case OP_ALT:
1427       {
1428         Idx left, right;
1429         dfa->has_plural_match = 1;
1430         if (node->left != NULL)
1431           left = node->left->first->node_idx;
1432         else
1433           left = node->next->node_idx;
1434         if (node->right != NULL)
1435           right = node->right->first->node_idx;
1436         else
1437           right = node->next->node_idx;
1438         assert (REG_VALID_INDEX (left));
1439         assert (REG_VALID_INDEX (right));
1440         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1441       }
1442       break;
1443
1444     case ANCHOR:
1445     case OP_OPEN_SUBEXP:
1446     case OP_CLOSE_SUBEXP:
1447       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1448       break;
1449
1450     case OP_BACK_REF:
1451       dfa->nexts[idx] = node->next->node_idx;
1452       if (node->token.type == OP_BACK_REF)
1453         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1454       break;
1455
1456     default:
1457       assert (!IS_EPSILON_NODE (node->token.type));
1458       dfa->nexts[idx] = node->next->node_idx;
1459       break;
1460     }
1461
1462   return err;
1463 }
1464
1465 /* Duplicate the epsilon closure of the node ROOT_NODE.
1466    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1467    to their own constraint.  */
1468
1469 static reg_errcode_t
1470 internal_function
1471 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1472                         Idx root_node, unsigned int init_constraint)
1473 {
1474   Idx org_node, clone_node;
1475   bool ok;
1476   unsigned int constraint = init_constraint;
1477   for (org_node = top_org_node, clone_node = top_clone_node;;)
1478     {
1479       Idx org_dest, clone_dest;
1480       if (dfa->nodes[org_node].type == OP_BACK_REF)
1481         {
1482           /* If the back reference epsilon-transit, its destination must
1483              also have the constraint.  Then duplicate the epsilon closure
1484              of the destination of the back reference, and store it in
1485              edests of the back reference.  */
1486           org_dest = dfa->nexts[org_node];
1487           re_node_set_empty (dfa->edests + clone_node);
1488           clone_dest = duplicate_node (dfa, org_dest, constraint);
1489           if (BE (clone_dest == REG_MISSING, 0))
1490             return REG_ESPACE;
1491           dfa->nexts[clone_node] = dfa->nexts[org_node];
1492           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1493           if (BE (! ok, 0))
1494             return REG_ESPACE;
1495         }
1496       else if (dfa->edests[org_node].nelem == 0)
1497         {
1498           /* In case of the node can't epsilon-transit, don't duplicate the
1499              destination and store the original destination as the
1500              destination of the node.  */
1501           dfa->nexts[clone_node] = dfa->nexts[org_node];
1502           break;
1503         }
1504       else if (dfa->edests[org_node].nelem == 1)
1505         {
1506           /* In case of the node can epsilon-transit, and it has only one
1507              destination.  */
1508           org_dest = dfa->edests[org_node].elems[0];
1509           re_node_set_empty (dfa->edests + clone_node);
1510           /* If the node is root_node itself, it means the epsilon closure
1511              has a loop.  Then tie it to the destination of the root_node.  */
1512           if (org_node == root_node && clone_node != org_node)
1513             {
1514               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1515               if (BE (! ok, 0))
1516                 return REG_ESPACE;
1517               break;
1518             }
1519           /* In case the node has another constraint, append it.  */
1520           constraint |= dfa->nodes[org_node].constraint;
1521           clone_dest = duplicate_node (dfa, org_dest, constraint);
1522           if (BE (clone_dest == REG_MISSING, 0))
1523             return REG_ESPACE;
1524           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1525           if (BE (! ok, 0))
1526             return REG_ESPACE;
1527         }
1528       else /* dfa->edests[org_node].nelem == 2 */
1529         {
1530           /* In case of the node can epsilon-transit, and it has two
1531              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1532           org_dest = dfa->edests[org_node].elems[0];
1533           re_node_set_empty (dfa->edests + clone_node);
1534           /* Search for a duplicated node which satisfies the constraint.  */
1535           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1536           if (clone_dest == REG_MISSING)
1537             {
1538               /* There is no such duplicated node, create a new one.  */
1539               reg_errcode_t err;
1540               clone_dest = duplicate_node (dfa, org_dest, constraint);
1541               if (BE (clone_dest == REG_MISSING, 0))
1542                 return REG_ESPACE;
1543               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544               if (BE (! ok, 0))
1545                 return REG_ESPACE;
1546               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547                                             root_node, constraint);
1548               if (BE (err != REG_NOERROR, 0))
1549                 return err;
1550             }
1551           else
1552             {
1553               /* There is a duplicated node which satisfies the constraint,
1554                  use it to avoid infinite loop.  */
1555               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556               if (BE (! ok, 0))
1557                 return REG_ESPACE;
1558             }
1559
1560           org_dest = dfa->edests[org_node].elems[1];
1561           clone_dest = duplicate_node (dfa, org_dest, constraint);
1562           if (BE (clone_dest == REG_MISSING, 0))
1563             return REG_ESPACE;
1564           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565           if (BE (! ok, 0))
1566             return REG_ESPACE;
1567         }
1568       org_node = org_dest;
1569       clone_node = clone_dest;
1570     }
1571   return REG_NOERROR;
1572 }
1573
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575    satisfies the constraint CONSTRAINT.  */
1576
1577 static Idx
1578 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1579                         unsigned int constraint)
1580 {
1581   Idx idx;
1582   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1583     {
1584       if (org_node == dfa->org_indices[idx]
1585           && constraint == dfa->nodes[idx].constraint)
1586         return idx; /* Found.  */
1587     }
1588   return REG_MISSING; /* Not found.  */
1589 }
1590
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592    Return the index of the new node, or REG_MISSING if insufficient storage is
1593    available.  */
1594
1595 static Idx
1596 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1597 {
1598   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1599   if (BE (dup_idx != REG_MISSING, 1))
1600     {
1601       dfa->nodes[dup_idx].constraint = constraint;
1602       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1603       dfa->nodes[dup_idx].duplicated = 1;
1604
1605       /* Store the index of the original node.  */
1606       dfa->org_indices[dup_idx] = org_idx;
1607     }
1608   return dup_idx;
1609 }
1610
1611 static reg_errcode_t
1612 calc_inveclosure (re_dfa_t *dfa)
1613 {
1614   Idx src, idx;
1615   bool ok;
1616   for (idx = 0; idx < dfa->nodes_len; ++idx)
1617     re_node_set_init_empty (dfa->inveclosures + idx);
1618
1619   for (src = 0; src < dfa->nodes_len; ++src)
1620     {
1621       Idx *elems = dfa->eclosures[src].elems;
1622       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1623         {
1624           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1625           if (BE (! ok, 0))
1626             return REG_ESPACE;
1627         }
1628     }
1629
1630   return REG_NOERROR;
1631 }
1632
1633 /* Calculate "eclosure" for all the node in DFA.  */
1634
1635 static reg_errcode_t
1636 calc_eclosure (re_dfa_t *dfa)
1637 {
1638   Idx node_idx;
1639   bool incomplete;
1640 #ifdef DEBUG
1641   assert (dfa->nodes_len > 0);
1642 #endif
1643   incomplete = false;
1644   /* For each nodes, calculate epsilon closure.  */
1645   for (node_idx = 0; ; ++node_idx)
1646     {
1647       reg_errcode_t err;
1648       re_node_set eclosure_elem;
1649       if (node_idx == dfa->nodes_len)
1650         {
1651           if (!incomplete)
1652             break;
1653           incomplete = false;
1654           node_idx = 0;
1655         }
1656
1657 #ifdef DEBUG
1658       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1659 #endif
1660
1661       /* If we have already calculated, skip it.  */
1662       if (dfa->eclosures[node_idx].nelem != 0)
1663         continue;
1664       /* Calculate epsilon closure of `node_idx'.  */
1665       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1666       if (BE (err != REG_NOERROR, 0))
1667         return err;
1668
1669       if (dfa->eclosures[node_idx].nelem == 0)
1670         {
1671           incomplete = true;
1672           re_node_set_free (&eclosure_elem);
1673         }
1674     }
1675   return REG_NOERROR;
1676 }
1677
1678 /* Calculate epsilon closure of NODE.  */
1679
1680 static reg_errcode_t
1681 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1682 {
1683   reg_errcode_t err;
1684   Idx i;
1685   re_node_set eclosure;
1686   bool ok;
1687   bool incomplete = false;
1688   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1689   if (BE (err != REG_NOERROR, 0))
1690     return err;
1691
1692   /* This indicates that we are calculating this node now.
1693      We reference this value to avoid infinite loop.  */
1694   dfa->eclosures[node].nelem = REG_MISSING;
1695
1696   /* If the current node has constraints, duplicate all nodes
1697      since they must inherit the constraints.  */
1698   if (dfa->nodes[node].constraint
1699       && dfa->edests[node].nelem
1700       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1701     {
1702       err = duplicate_node_closure (dfa, node, node, node,
1703                                     dfa->nodes[node].constraint);
1704       if (BE (err != REG_NOERROR, 0))
1705         return err;
1706     }
1707
1708   /* Expand each epsilon destination nodes.  */
1709   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1710     for (i = 0; i < dfa->edests[node].nelem; ++i)
1711       {
1712         re_node_set eclosure_elem;
1713         Idx edest = dfa->edests[node].elems[i];
1714         /* If calculating the epsilon closure of `edest' is in progress,
1715            return intermediate result.  */
1716         if (dfa->eclosures[edest].nelem == REG_MISSING)
1717           {
1718             incomplete = true;
1719             continue;
1720           }
1721         /* If we haven't calculated the epsilon closure of `edest' yet,
1722            calculate now. Otherwise use calculated epsilon closure.  */
1723         if (dfa->eclosures[edest].nelem == 0)
1724           {
1725             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1726             if (BE (err != REG_NOERROR, 0))
1727               return err;
1728           }
1729         else
1730           eclosure_elem = dfa->eclosures[edest];
1731         /* Merge the epsilon closure of `edest'.  */
1732         err = re_node_set_merge (&eclosure, &eclosure_elem);
1733         if (BE (err != REG_NOERROR, 0))
1734           return err;
1735         /* If the epsilon closure of `edest' is incomplete,
1736            the epsilon closure of this node is also incomplete.  */
1737         if (dfa->eclosures[edest].nelem == 0)
1738           {
1739             incomplete = true;
1740             re_node_set_free (&eclosure_elem);
1741           }
1742       }
1743
1744   /* An epsilon closure includes itself.  */
1745   ok = re_node_set_insert (&eclosure, node);
1746   if (BE (! ok, 0))
1747     return REG_ESPACE;
1748   if (incomplete && !root)
1749     dfa->eclosures[node].nelem = 0;
1750   else
1751     dfa->eclosures[node] = eclosure;
1752   *new_set = eclosure;
1753   return REG_NOERROR;
1754 }
1755 \f
1756 /* Functions for token which are used in the parser.  */
1757
1758 /* Fetch a token from INPUT.
1759    We must not use this function inside bracket expressions.  */
1760
1761 static void
1762 internal_function
1763 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1764 {
1765   re_string_skip_bytes (input, peek_token (result, input, syntax));
1766 }
1767
1768 /* Peek a token from INPUT, and return the length of the token.
1769    We must not use this function inside bracket expressions.  */
1770
1771 static int
1772 internal_function
1773 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1774 {
1775   unsigned char c;
1776
1777   if (re_string_eoi (input))
1778     {
1779       token->type = END_OF_RE;
1780       return 0;
1781     }
1782
1783   c = re_string_peek_byte (input, 0);
1784   token->opr.c = c;
1785
1786   token->word_char = 0;
1787 #ifdef RE_ENABLE_I18N
1788   token->mb_partial = 0;
1789   if (input->mb_cur_max > 1 &&
1790       !re_string_first_byte (input, re_string_cur_idx (input)))
1791     {
1792       token->type = CHARACTER;
1793       token->mb_partial = 1;
1794       return 1;
1795     }
1796 #endif
1797   if (c == '\\')
1798     {
1799       unsigned char c2;
1800       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1801         {
1802           token->type = BACK_SLASH;
1803           return 1;
1804         }
1805
1806       c2 = re_string_peek_byte_case (input, 1);
1807       token->opr.c = c2;
1808       token->type = CHARACTER;
1809 #ifdef RE_ENABLE_I18N
1810       if (input->mb_cur_max > 1)
1811         {
1812           wint_t wc = re_string_wchar_at (input,
1813                                           re_string_cur_idx (input) + 1);
1814           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1815         }
1816       else
1817 #endif
1818         token->word_char = IS_WORD_CHAR (c2) != 0;
1819
1820       switch (c2)
1821         {
1822         case '|':
1823           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1824             token->type = OP_ALT;
1825           break;
1826         case '1': case '2': case '3': case '4': case '5':
1827         case '6': case '7': case '8': case '9':
1828           if (!(syntax & RE_NO_BK_REFS))
1829             {
1830               token->type = OP_BACK_REF;
1831               token->opr.idx = c2 - '1';
1832             }
1833           break;
1834         case '<':
1835           if (!(syntax & RE_NO_GNU_OPS))
1836             {
1837               token->type = ANCHOR;
1838               token->opr.ctx_type = WORD_FIRST;
1839             }
1840           break;
1841         case '>':
1842           if (!(syntax & RE_NO_GNU_OPS))
1843             {
1844               token->type = ANCHOR;
1845               token->opr.ctx_type = WORD_LAST;
1846             }
1847           break;
1848         case 'b':
1849           if (!(syntax & RE_NO_GNU_OPS))
1850             {
1851               token->type = ANCHOR;
1852               token->opr.ctx_type = WORD_DELIM;
1853             }
1854           break;
1855         case 'B':
1856           if (!(syntax & RE_NO_GNU_OPS))
1857             {
1858               token->type = ANCHOR;
1859               token->opr.ctx_type = NOT_WORD_DELIM;
1860             }
1861           break;
1862         case 'w':
1863           if (!(syntax & RE_NO_GNU_OPS))
1864             token->type = OP_WORD;
1865           break;
1866         case 'W':
1867           if (!(syntax & RE_NO_GNU_OPS))
1868             token->type = OP_NOTWORD;
1869           break;
1870         case 's':
1871           if (!(syntax & RE_NO_GNU_OPS))
1872             token->type = OP_SPACE;
1873           break;
1874         case 'S':
1875           if (!(syntax & RE_NO_GNU_OPS))
1876             token->type = OP_NOTSPACE;
1877           break;
1878         case '`':
1879           if (!(syntax & RE_NO_GNU_OPS))
1880             {
1881               token->type = ANCHOR;
1882               token->opr.ctx_type = BUF_FIRST;
1883             }
1884           break;
1885         case '\'':
1886           if (!(syntax & RE_NO_GNU_OPS))
1887             {
1888               token->type = ANCHOR;
1889               token->opr.ctx_type = BUF_LAST;
1890             }
1891           break;
1892         case '(':
1893           if (!(syntax & RE_NO_BK_PARENS))
1894             token->type = OP_OPEN_SUBEXP;
1895           break;
1896         case ')':
1897           if (!(syntax & RE_NO_BK_PARENS))
1898             token->type = OP_CLOSE_SUBEXP;
1899           break;
1900         case '+':
1901           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902             token->type = OP_DUP_PLUS;
1903           break;
1904         case '?':
1905           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1906             token->type = OP_DUP_QUESTION;
1907           break;
1908         case '{':
1909           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910             token->type = OP_OPEN_DUP_NUM;
1911           break;
1912         case '}':
1913           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1914             token->type = OP_CLOSE_DUP_NUM;
1915           break;
1916         default:
1917           break;
1918         }
1919       return 2;
1920     }
1921
1922   token->type = CHARACTER;
1923 #ifdef RE_ENABLE_I18N
1924   if (input->mb_cur_max > 1)
1925     {
1926       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1927       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1928     }
1929   else
1930 #endif
1931     token->word_char = IS_WORD_CHAR (token->opr.c);
1932
1933   switch (c)
1934     {
1935     case '\n':
1936       if (syntax & RE_NEWLINE_ALT)
1937         token->type = OP_ALT;
1938       break;
1939     case '|':
1940       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1941         token->type = OP_ALT;
1942       break;
1943     case '*':
1944       token->type = OP_DUP_ASTERISK;
1945       break;
1946     case '+':
1947       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948         token->type = OP_DUP_PLUS;
1949       break;
1950     case '?':
1951       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1952         token->type = OP_DUP_QUESTION;
1953       break;
1954     case '{':
1955       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956         token->type = OP_OPEN_DUP_NUM;
1957       break;
1958     case '}':
1959       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1960         token->type = OP_CLOSE_DUP_NUM;
1961       break;
1962     case '(':
1963       if (syntax & RE_NO_BK_PARENS)
1964         token->type = OP_OPEN_SUBEXP;
1965       break;
1966     case ')':
1967       if (syntax & RE_NO_BK_PARENS)
1968         token->type = OP_CLOSE_SUBEXP;
1969       break;
1970     case '[':
1971       token->type = OP_OPEN_BRACKET;
1972       break;
1973     case '.':
1974       token->type = OP_PERIOD;
1975       break;
1976     case '^':
1977       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1978           re_string_cur_idx (input) != 0)
1979         {
1980           char prev = re_string_peek_byte (input, -1);
1981           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1982             break;
1983         }
1984       token->type = ANCHOR;
1985       token->opr.ctx_type = LINE_FIRST;
1986       break;
1987     case '$':
1988       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1989           re_string_cur_idx (input) + 1 != re_string_length (input))
1990         {
1991           re_token_t next;
1992           re_string_skip_bytes (input, 1);
1993           peek_token (&next, input, syntax);
1994           re_string_skip_bytes (input, -1);
1995           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1996             break;
1997         }
1998       token->type = ANCHOR;
1999       token->opr.ctx_type = LINE_LAST;
2000       break;
2001     default:
2002       break;
2003     }
2004   return 1;
2005 }
2006
2007 /* Peek a token from INPUT, and return the length of the token.
2008    We must not use this function out of bracket expressions.  */
2009
2010 static int
2011 internal_function
2012 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2013 {
2014   unsigned char c;
2015   if (re_string_eoi (input))
2016     {
2017       token->type = END_OF_RE;
2018       return 0;
2019     }
2020   c = re_string_peek_byte (input, 0);
2021   token->opr.c = c;
2022
2023 #ifdef RE_ENABLE_I18N
2024   if (input->mb_cur_max > 1 &&
2025       !re_string_first_byte (input, re_string_cur_idx (input)))
2026     {
2027       token->type = CHARACTER;
2028       return 1;
2029     }
2030 #endif /* RE_ENABLE_I18N */
2031
2032   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2033       && re_string_cur_idx (input) + 1 < re_string_length (input))
2034     {
2035       /* In this case, '\' escape a character.  */
2036       unsigned char c2;
2037       re_string_skip_bytes (input, 1);
2038       c2 = re_string_peek_byte (input, 0);
2039       token->opr.c = c2;
2040       token->type = CHARACTER;
2041       return 1;
2042     }
2043   if (c == '[') /* '[' is a special char in a bracket exps.  */
2044     {
2045       unsigned char c2;
2046       int token_len;
2047       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2048         c2 = re_string_peek_byte (input, 1);
2049       else
2050         c2 = 0;
2051       token->opr.c = c2;
2052       token_len = 2;
2053       switch (c2)
2054         {
2055         case '.':
2056           token->type = OP_OPEN_COLL_ELEM;
2057           break;
2058         case '=':
2059           token->type = OP_OPEN_EQUIV_CLASS;
2060           break;
2061         case ':':
2062           if (syntax & RE_CHAR_CLASSES)
2063             {
2064               token->type = OP_OPEN_CHAR_CLASS;
2065               break;
2066             }
2067           /* else fall through.  */
2068         default:
2069           token->type = CHARACTER;
2070           token->opr.c = c;
2071           token_len = 1;
2072           break;
2073         }
2074       return token_len;
2075     }
2076   switch (c)
2077     {
2078     case '-':
2079       token->type = OP_CHARSET_RANGE;
2080       break;
2081     case ']':
2082       token->type = OP_CLOSE_BRACKET;
2083       break;
2084     case '^':
2085       token->type = OP_NON_MATCH_LIST;
2086       break;
2087     default:
2088       token->type = CHARACTER;
2089     }
2090   return 1;
2091 }
2092 \f
2093 /* Functions for parser.  */
2094
2095 /* Entry point of the parser.
2096    Parse the regular expression REGEXP and return the structure tree.
2097    If an error is occured, ERR is set by error code, and return NULL.
2098    This function build the following tree, from regular expression <reg_exp>:
2099            CAT
2100            / \
2101           /   \
2102    <reg_exp>  EOR
2103
2104    CAT means concatenation.
2105    EOR means end of regular expression.  */
2106
2107 static bin_tree_t *
2108 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2109        reg_errcode_t *err)
2110 {
2111   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2112   bin_tree_t *tree, *eor, *root;
2113   re_token_t current_token;
2114   dfa->syntax = syntax;
2115   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2116   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2117   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2118     return NULL;
2119   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2120   if (tree != NULL)
2121     root = create_tree (dfa, tree, eor, CONCAT);
2122   else
2123     root = eor;
2124   if (BE (eor == NULL || root == NULL, 0))
2125     {
2126       *err = REG_ESPACE;
2127       return NULL;
2128     }
2129   return root;
2130 }
2131
2132 /* This function build the following tree, from regular expression
2133    <branch1>|<branch2>:
2134            ALT
2135            / \
2136           /   \
2137    <branch1> <branch2>
2138
2139    ALT means alternative, which represents the operator `|'.  */
2140
2141 static bin_tree_t *
2142 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2143                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2144 {
2145   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2146   bin_tree_t *tree, *branch = NULL;
2147   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2148   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149     return NULL;
2150
2151   while (token->type == OP_ALT)
2152     {
2153       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154       if (token->type != OP_ALT && token->type != END_OF_RE
2155           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2156         {
2157           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2158           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2159             return NULL;
2160         }
2161       else
2162         branch = NULL;
2163       tree = create_tree (dfa, tree, branch, OP_ALT);
2164       if (BE (tree == NULL, 0))
2165         {
2166           *err = REG_ESPACE;
2167           return NULL;
2168         }
2169     }
2170   return tree;
2171 }
2172
2173 /* This function build the following tree, from regular expression
2174    <exp1><exp2>:
2175         CAT
2176         / \
2177        /   \
2178    <exp1> <exp2>
2179
2180    CAT means concatenation.  */
2181
2182 static bin_tree_t *
2183 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2184               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2185 {
2186   bin_tree_t *tree, *expr;
2187   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2188   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2189   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190     return NULL;
2191
2192   while (token->type != OP_ALT && token->type != END_OF_RE
2193          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194     {
2195       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2196       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2197         {
2198           return NULL;
2199         }
2200       if (tree != NULL && expr != NULL)
2201         {
2202           tree = create_tree (dfa, tree, expr, CONCAT);
2203           if (tree == NULL)
2204             {
2205               *err = REG_ESPACE;
2206               return NULL;
2207             }
2208         }
2209       else if (tree == NULL)
2210         tree = expr;
2211       /* Otherwise expr == NULL, we don't need to create new tree.  */
2212     }
2213   return tree;
2214 }
2215
2216 /* This function build the following tree, from regular expression a*:
2217          *
2218          |
2219          a
2220 */
2221
2222 static bin_tree_t *
2223 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2224                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2225 {
2226   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227   bin_tree_t *tree;
2228   switch (token->type)
2229     {
2230     case CHARACTER:
2231       tree = create_token_tree (dfa, NULL, NULL, token);
2232       if (BE (tree == NULL, 0))
2233         {
2234           *err = REG_ESPACE;
2235           return NULL;
2236         }
2237 #ifdef RE_ENABLE_I18N
2238       if (dfa->mb_cur_max > 1)
2239         {
2240           while (!re_string_eoi (regexp)
2241                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242             {
2243               bin_tree_t *mbc_remain;
2244               fetch_token (token, regexp, syntax);
2245               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2246               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2247               if (BE (mbc_remain == NULL || tree == NULL, 0))
2248                 {
2249                   *err = REG_ESPACE;
2250                   return NULL;
2251                 }
2252             }
2253         }
2254 #endif
2255       break;
2256     case OP_OPEN_SUBEXP:
2257       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2258       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259         return NULL;
2260       break;
2261     case OP_OPEN_BRACKET:
2262       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2263       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2264         return NULL;
2265       break;
2266     case OP_BACK_REF:
2267       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2268         {
2269           *err = REG_ESUBREG;
2270           return NULL;
2271         }
2272       dfa->used_bkref_map |= 1 << token->opr.idx;
2273       tree = create_token_tree (dfa, NULL, NULL, token);
2274       if (BE (tree == NULL, 0))
2275         {
2276           *err = REG_ESPACE;
2277           return NULL;
2278         }
2279       ++dfa->nbackref;
2280       dfa->has_mb_node = 1;
2281       break;
2282     case OP_OPEN_DUP_NUM:
2283       if (syntax & RE_CONTEXT_INVALID_DUP)
2284         {
2285           *err = REG_BADRPT;
2286           return NULL;
2287         }
2288       /* FALLTHROUGH */
2289     case OP_DUP_ASTERISK:
2290     case OP_DUP_PLUS:
2291     case OP_DUP_QUESTION:
2292       if (syntax & RE_CONTEXT_INVALID_OPS)
2293         {
2294           *err = REG_BADRPT;
2295           return NULL;
2296         }
2297       else if (syntax & RE_CONTEXT_INDEP_OPS)
2298         {
2299           fetch_token (token, regexp, syntax);
2300           return parse_expression (regexp, preg, token, syntax, nest, err);
2301         }
2302       /* else fall through  */
2303     case OP_CLOSE_SUBEXP:
2304       if ((token->type == OP_CLOSE_SUBEXP) &&
2305           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2306         {
2307           *err = REG_ERPAREN;
2308           return NULL;
2309         }
2310       /* else fall through  */
2311     case OP_CLOSE_DUP_NUM:
2312       /* We treat it as a normal character.  */
2313
2314       /* Then we can these characters as normal characters.  */
2315       token->type = CHARACTER;
2316       /* mb_partial and word_char bits should be initialized already
2317          by peek_token.  */
2318       tree = create_token_tree (dfa, NULL, NULL, token);
2319       if (BE (tree == NULL, 0))
2320         {
2321           *err = REG_ESPACE;
2322           return NULL;
2323         }
2324       break;
2325     case ANCHOR:
2326       if ((token->opr.ctx_type
2327            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2328           && dfa->word_ops_used == 0)
2329         init_word_char (dfa);
2330       if (token->opr.ctx_type == WORD_DELIM
2331           || token->opr.ctx_type == NOT_WORD_DELIM)
2332         {
2333           bin_tree_t *tree_first, *tree_last;
2334           if (token->opr.ctx_type == WORD_DELIM)
2335             {
2336               token->opr.ctx_type = WORD_FIRST;
2337               tree_first = create_token_tree (dfa, NULL, NULL, token);
2338               token->opr.ctx_type = WORD_LAST;
2339             }
2340           else
2341             {
2342               token->opr.ctx_type = INSIDE_WORD;
2343               tree_first = create_token_tree (dfa, NULL, NULL, token);
2344               token->opr.ctx_type = INSIDE_NOTWORD;
2345             }
2346           tree_last = create_token_tree (dfa, NULL, NULL, token);
2347           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2348           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2349             {
2350               *err = REG_ESPACE;
2351               return NULL;
2352             }
2353         }
2354       else
2355         {
2356           tree = create_token_tree (dfa, NULL, NULL, token);
2357           if (BE (tree == NULL, 0))
2358             {
2359               *err = REG_ESPACE;
2360               return NULL;
2361             }
2362         }
2363       /* We must return here, since ANCHORs can't be followed
2364          by repetition operators.
2365          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2366              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2367       fetch_token (token, regexp, syntax);
2368       return tree;
2369     case OP_PERIOD:
2370       tree = create_token_tree (dfa, NULL, NULL, token);
2371       if (BE (tree == NULL, 0))
2372         {
2373           *err = REG_ESPACE;
2374           return NULL;
2375         }
2376       if (dfa->mb_cur_max > 1)
2377         dfa->has_mb_node = 1;
2378       break;
2379     case OP_WORD:
2380     case OP_NOTWORD:
2381       tree = build_charclass_op (dfa, regexp->trans,
2382                                  (const unsigned char *) "alnum",
2383                                  (const unsigned char *) "_",
2384                                  token->type == OP_NOTWORD, err);
2385       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2386         return NULL;
2387       break;
2388     case OP_SPACE:
2389     case OP_NOTSPACE:
2390       tree = build_charclass_op (dfa, regexp->trans,
2391                                  (const unsigned char *) "space",
2392                                  (const unsigned char *) "",
2393                                  token->type == OP_NOTSPACE, err);
2394       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395         return NULL;
2396       break;
2397     case OP_ALT:
2398     case END_OF_RE:
2399       return NULL;
2400     case BACK_SLASH:
2401       *err = REG_EESCAPE;
2402       return NULL;
2403     default:
2404       /* Must not happen?  */
2405 #ifdef DEBUG
2406       assert (0);
2407 #endif
2408       return NULL;
2409     }
2410   fetch_token (token, regexp, syntax);
2411
2412   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2413          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414     {
2415       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2416       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2417         return NULL;
2418       /* In BRE consecutive duplications are not allowed.  */
2419       if ((syntax & RE_CONTEXT_INVALID_DUP)
2420           && (token->type == OP_DUP_ASTERISK
2421               || token->type == OP_OPEN_DUP_NUM))
2422         {
2423           *err = REG_BADRPT;
2424           return NULL;
2425         }
2426     }
2427
2428   return tree;
2429 }
2430
2431 /* This function build the following tree, from regular expression
2432    (<reg_exp>):
2433          SUBEXP
2434             |
2435         <reg_exp>
2436 */
2437
2438 static bin_tree_t *
2439 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2440                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2441 {
2442   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2443   bin_tree_t *tree;
2444   size_t cur_nsub;
2445   cur_nsub = preg->re_nsub++;
2446
2447   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2448
2449   /* The subexpression may be a null string.  */
2450   if (token->type == OP_CLOSE_SUBEXP)
2451     tree = NULL;
2452   else
2453     {
2454       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2455       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2456         *err = REG_EPAREN;
2457       if (BE (*err != REG_NOERROR, 0))
2458         return NULL;
2459     }
2460
2461   if (cur_nsub <= '9' - '1')
2462     dfa->completed_bkref_map |= 1 << cur_nsub;
2463
2464   tree = create_tree (dfa, tree, NULL, SUBEXP);
2465   if (BE (tree == NULL, 0))
2466     {
2467       *err = REG_ESPACE;
2468       return NULL;
2469     }
2470   tree->token.opr.idx = cur_nsub;
2471   return tree;
2472 }
2473
2474 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2475
2476 static bin_tree_t *
2477 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2478               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2479 {
2480   bin_tree_t *tree = NULL, *old_tree = NULL;
2481   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2482   re_token_t start_token = *token;
2483
2484   if (token->type == OP_OPEN_DUP_NUM)
2485     {
2486       end = 0;
2487       start = fetch_number (regexp, token, syntax);
2488       if (start == REG_MISSING)
2489         {
2490           if (token->type == CHARACTER && token->opr.c == ',')
2491             start = 0; /* We treat "{,m}" as "{0,m}".  */
2492           else
2493             {
2494               *err = REG_BADBR; /* <re>{} is invalid.  */
2495               return NULL;
2496             }
2497         }
2498       if (BE (start != REG_ERROR, 1))
2499         {
2500           /* We treat "{n}" as "{n,n}".  */
2501           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2502                  : ((token->type == CHARACTER && token->opr.c == ',')
2503                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2504         }
2505       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2506         {
2507           /* Invalid sequence.  */
2508           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2509             {
2510               if (token->type == END_OF_RE)
2511                 *err = REG_EBRACE;
2512               else
2513                 *err = REG_BADBR;
2514
2515               return NULL;
2516             }
2517
2518           /* If the syntax bit is set, rollback.  */
2519           re_string_set_index (regexp, start_idx);
2520           *token = start_token;
2521           token->type = CHARACTER;
2522           /* mb_partial and word_char bits should be already initialized by
2523              peek_token.  */
2524           return elem;
2525         }
2526
2527       if (BE ((end != REG_MISSING && start > end)
2528               || token->type != OP_CLOSE_DUP_NUM, 0))
2529         {
2530           /* First number greater than second.  */
2531           *err = REG_BADBR;
2532           return NULL;
2533         }
2534     }
2535   else
2536     {
2537       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2538       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2539     }
2540
2541   fetch_token (token, regexp, syntax);
2542
2543   if (BE (elem == NULL, 0))
2544     return NULL;
2545   if (BE (start == 0 && end == 0, 0))
2546     {
2547       postorder (elem, free_tree, NULL);
2548       return NULL;
2549     }
2550
2551   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2552   if (BE (start > 0, 0))
2553     {
2554       tree = elem;
2555       for (i = 2; i <= start; ++i)
2556         {
2557           elem = duplicate_tree (elem, dfa);
2558           tree = create_tree (dfa, tree, elem, CONCAT);
2559           if (BE (elem == NULL || tree == NULL, 0))
2560             goto parse_dup_op_espace;
2561         }
2562
2563       if (start == end)
2564         return tree;
2565
2566       /* Duplicate ELEM before it is marked optional.  */
2567       elem = duplicate_tree (elem, dfa);
2568       old_tree = tree;
2569     }
2570   else
2571     old_tree = NULL;
2572
2573   if (elem->token.type == SUBEXP)
2574     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2575
2576   tree = create_tree (dfa, elem, NULL,
2577                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2578   if (BE (tree == NULL, 0))
2579     goto parse_dup_op_espace;
2580
2581 /* From gnulib's "intprops.h":
2582    True if the arithmetic type T is signed.  */
2583 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2584
2585   /* This loop is actually executed only when end != REG_MISSING,
2586      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2587      already created the start+1-th copy.  */
2588   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2589     for (i = start + 2; i <= end; ++i)
2590       {
2591         elem = duplicate_tree (elem, dfa);
2592         tree = create_tree (dfa, tree, elem, CONCAT);
2593         if (BE (elem == NULL || tree == NULL, 0))
2594           goto parse_dup_op_espace;
2595
2596         tree = create_tree (dfa, tree, NULL, OP_ALT);
2597         if (BE (tree == NULL, 0))
2598           goto parse_dup_op_espace;
2599       }
2600
2601   if (old_tree)
2602     tree = create_tree (dfa, old_tree, tree, CONCAT);
2603
2604   return tree;
2605
2606  parse_dup_op_espace:
2607   *err = REG_ESPACE;
2608   return NULL;
2609 }
2610
2611 /* Size of the names for collating symbol/equivalence_class/character_class.
2612    I'm not sure, but maybe enough.  */
2613 #define BRACKET_NAME_BUF_SIZE 32
2614
2615 #ifndef _LIBC
2616   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2617      Build the range expression which starts from START_ELEM, and ends
2618      at END_ELEM.  The results are written to MBCSET and SBCSET.
2619      RANGE_ALLOC is the allocated size of mbcset->range_starts and
2620      mbcset->range_ends; it is a pointer argument since we may
2621      update it.  */
2622
2623 static reg_errcode_t
2624 internal_function
2625 # ifdef RE_ENABLE_I18N
2626 build_range_exp (const reg_syntax_t syntax,
2627                  bitset_t sbcset,
2628                  re_charset_t *mbcset,
2629                  Idx *range_alloc,
2630                  const bracket_elem_t *start_elem,
2631                  const bracket_elem_t *end_elem)
2632 # else /* not RE_ENABLE_I18N */
2633 build_range_exp (const reg_syntax_t syntax,
2634                  bitset_t sbcset,
2635                  const bracket_elem_t *start_elem,
2636                  const bracket_elem_t *end_elem)
2637 # endif /* not RE_ENABLE_I18N */
2638 {
2639   unsigned int start_ch, end_ch;
2640   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2641   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2642           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2643           0))
2644     return REG_ERANGE;
2645
2646   /* We can handle no multi character collating elements without libc
2647      support.  */
2648   if (BE ((start_elem->type == COLL_SYM
2649            && strlen ((char *) start_elem->opr.name) > 1)
2650           || (end_elem->type == COLL_SYM
2651               && strlen ((char *) end_elem->opr.name) > 1), 0))
2652     return REG_ECOLLATE;
2653
2654 # ifdef RE_ENABLE_I18N
2655   {
2656     wchar_t wc;
2657     wint_t start_wc;
2658     wint_t end_wc;
2659     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2660
2661     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2662                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2663                    : 0));
2664     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2665               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2666                  : 0));
2667     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2668                 ? __btowc (start_ch) : start_elem->opr.wch);
2669     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2670               ? __btowc (end_ch) : end_elem->opr.wch);
2671     if (start_wc == WEOF || end_wc == WEOF)
2672       return REG_ECOLLATE;
2673     cmp_buf[0] = start_wc;
2674     cmp_buf[4] = end_wc;
2675
2676     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2677             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2678       return REG_ERANGE;
2679
2680     /* Got valid collation sequence values, add them as a new entry.
2681        However, for !_LIBC we have no collation elements: if the
2682        character set is single byte, the single byte character set
2683        that we build below suffices.  parse_bracket_exp passes
2684        no MBCSET if dfa->mb_cur_max == 1.  */
2685     if (mbcset)
2686       {
2687         /* Check the space of the arrays.  */
2688         if (BE (*range_alloc == mbcset->nranges, 0))
2689           {
2690             /* There is not enough space, need realloc.  */
2691             wchar_t *new_array_start, *new_array_end;
2692             Idx new_nranges;
2693
2694             /* +1 in case of mbcset->nranges is 0.  */
2695             new_nranges = 2 * mbcset->nranges + 1;
2696             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2697                are NULL if *range_alloc == 0.  */
2698             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2699                                           new_nranges);
2700             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2701                                         new_nranges);
2702
2703             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2704               return REG_ESPACE;
2705
2706             mbcset->range_starts = new_array_start;
2707             mbcset->range_ends = new_array_end;
2708             *range_alloc = new_nranges;
2709           }
2710
2711         mbcset->range_starts[mbcset->nranges] = start_wc;
2712         mbcset->range_ends[mbcset->nranges++] = end_wc;
2713       }
2714
2715     /* Build the table for single byte characters.  */
2716     for (wc = 0; wc < SBC_MAX; ++wc)
2717       {
2718         cmp_buf[2] = wc;
2719         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2720             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2721           bitset_set (sbcset, wc);
2722       }
2723   }
2724 # else /* not RE_ENABLE_I18N */
2725   {
2726     unsigned int ch;
2727     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2728                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2729                    : 0));
2730     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2731               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2732                  : 0));
2733     if (start_ch > end_ch)
2734       return REG_ERANGE;
2735     /* Build the table for single byte characters.  */
2736     for (ch = 0; ch < SBC_MAX; ++ch)
2737       if (start_ch <= ch  && ch <= end_ch)
2738         bitset_set (sbcset, ch);
2739   }
2740 # endif /* not RE_ENABLE_I18N */
2741   return REG_NOERROR;
2742 }
2743 #endif /* not _LIBC */
2744
2745 #ifndef _LIBC
2746 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2747    Build the collating element which is represented by NAME.
2748    The result are written to MBCSET and SBCSET.
2749    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2750    pointer argument since we may update it.  */
2751
2752 static reg_errcode_t
2753 internal_function
2754 build_collating_symbol (bitset_t sbcset,
2755 # ifdef RE_ENABLE_I18N
2756                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2757 # endif
2758                         const unsigned char *name)
2759 {
2760   size_t name_len = strlen ((const char *) name);
2761   if (BE (name_len != 1, 0))
2762     return REG_ECOLLATE;
2763   else
2764     {
2765       bitset_set (sbcset, name[0]);
2766       return REG_NOERROR;
2767     }
2768 }
2769 #endif /* not _LIBC */
2770
2771 /* This function parse bracket expression like "[abc]", "[a-c]",
2772    "[[.a-a.]]" etc.  */
2773
2774 static bin_tree_t *
2775 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2776                    reg_syntax_t syntax, reg_errcode_t *err)
2777 {
2778 #ifdef _LIBC
2779   const unsigned char *collseqmb;
2780   const char *collseqwc;
2781   uint32_t nrules;
2782   int32_t table_size;
2783   const int32_t *symb_table;
2784   const unsigned char *extra;
2785
2786   /* Local function for parse_bracket_exp used in _LIBC environement.
2787      Seek the collating symbol entry correspondings to NAME.
2788      Return the index of the symbol in the SYMB_TABLE.  */
2789
2790   auto inline int32_t
2791   __attribute ((always_inline))
2792   seek_collating_symbol_entry (name, name_len)
2793          const unsigned char *name;
2794          size_t name_len;
2795     {
2796       int32_t hash = elem_hash ((const char *) name, name_len);
2797       int32_t elem = hash % table_size;
2798       if (symb_table[2 * elem] != 0)
2799         {
2800           int32_t second = hash % (table_size - 2) + 1;
2801
2802           do
2803             {
2804               /* First compare the hashing value.  */
2805               if (symb_table[2 * elem] == hash
2806                   /* Compare the length of the name.  */
2807                   && name_len == extra[symb_table[2 * elem + 1]]
2808                   /* Compare the name.  */
2809                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2810                              name_len) == 0)
2811                 {
2812                   /* Yep, this is the entry.  */
2813                   break;
2814                 }
2815
2816               /* Next entry.  */
2817               elem += second;
2818             }
2819           while (symb_table[2 * elem] != 0);
2820         }
2821       return elem;
2822     }
2823
2824   /* Local function for parse_bracket_exp used in _LIBC environment.
2825      Look up the collation sequence value of BR_ELEM.
2826      Return the value if succeeded, UINT_MAX otherwise.  */
2827
2828   auto inline unsigned int
2829   __attribute ((always_inline))
2830   lookup_collation_sequence_value (br_elem)
2831          bracket_elem_t *br_elem;
2832     {
2833       if (br_elem->type == SB_CHAR)
2834         {
2835           /*
2836           if (MB_CUR_MAX == 1)
2837           */
2838           if (nrules == 0)
2839             return collseqmb[br_elem->opr.ch];
2840           else
2841             {
2842               wint_t wc = __btowc (br_elem->opr.ch);
2843               return __collseq_table_lookup (collseqwc, wc);
2844             }
2845         }
2846       else if (br_elem->type == MB_CHAR)
2847         {
2848           if (nrules != 0)
2849             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2850         }
2851       else if (br_elem->type == COLL_SYM)
2852         {
2853           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2854           if (nrules != 0)
2855             {
2856               int32_t elem, idx;
2857               elem = seek_collating_symbol_entry (br_elem->opr.name,
2858                                                   sym_name_len);
2859               if (symb_table[2 * elem] != 0)
2860                 {
2861                   /* We found the entry.  */
2862                   idx = symb_table[2 * elem + 1];
2863                   /* Skip the name of collating element name.  */
2864                   idx += 1 + extra[idx];
2865                   /* Skip the byte sequence of the collating element.  */
2866                   idx += 1 + extra[idx];
2867                   /* Adjust for the alignment.  */
2868                   idx = (idx + 3) & ~3;
2869                   /* Skip the multibyte collation sequence value.  */
2870                   idx += sizeof (unsigned int);
2871                   /* Skip the wide char sequence of the collating element.  */
2872                   idx += sizeof (unsigned int) *
2873                     (1 + *(unsigned int *) (extra + idx));
2874                   /* Return the collation sequence value.  */
2875                   return *(unsigned int *) (extra + idx);
2876                 }
2877               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2878                 {
2879                   /* No valid character.  Match it as a single byte
2880                      character.  */
2881                   return collseqmb[br_elem->opr.name[0]];
2882                 }
2883             }
2884           else if (sym_name_len == 1)
2885             return collseqmb[br_elem->opr.name[0]];
2886         }
2887       return UINT_MAX;
2888     }
2889
2890   /* Local function for parse_bracket_exp used in _LIBC environement.
2891      Build the range expression which starts from START_ELEM, and ends
2892      at END_ELEM.  The result are written to MBCSET and SBCSET.
2893      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2894      mbcset->range_ends, is a pointer argument sinse we may
2895      update it.  */
2896
2897   auto inline reg_errcode_t
2898   __attribute ((always_inline))
2899   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2900          re_charset_t *mbcset;
2901          Idx *range_alloc;
2902          bitset_t sbcset;
2903          bracket_elem_t *start_elem, *end_elem;
2904     {
2905       unsigned int ch;
2906       uint32_t start_collseq;
2907       uint32_t end_collseq;
2908
2909       /* Equivalence Classes and Character Classes can't be a range
2910          start/end.  */
2911       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2912               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2913               0))
2914         return REG_ERANGE;
2915
2916       start_collseq = lookup_collation_sequence_value (start_elem);
2917       end_collseq = lookup_collation_sequence_value (end_elem);
2918       /* Check start/end collation sequence values.  */
2919       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2920         return REG_ECOLLATE;
2921       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2922         return REG_ERANGE;
2923
2924       /* Got valid collation sequence values, add them as a new entry.
2925          However, if we have no collation elements, and the character set
2926          is single byte, the single byte character set that we
2927          build below suffices. */
2928       if (nrules > 0 || dfa->mb_cur_max > 1)
2929         {
2930           /* Check the space of the arrays.  */
2931           if (BE (*range_alloc == mbcset->nranges, 0))
2932             {
2933               /* There is not enough space, need realloc.  */
2934               uint32_t *new_array_start;
2935               uint32_t *new_array_end;
2936               Idx new_nranges;
2937
2938               /* +1 in case of mbcset->nranges is 0.  */
2939               new_nranges = 2 * mbcset->nranges + 1;
2940               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2941                                             new_nranges);
2942               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2943                                           new_nranges);
2944
2945               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2946                 return REG_ESPACE;
2947
2948               mbcset->range_starts = new_array_start;
2949               mbcset->range_ends = new_array_end;
2950               *range_alloc = new_nranges;
2951             }
2952
2953           mbcset->range_starts[mbcset->nranges] = start_collseq;
2954           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2955         }
2956
2957       /* Build the table for single byte characters.  */
2958       for (ch = 0; ch < SBC_MAX; ch++)
2959         {
2960           uint32_t ch_collseq;
2961           /*
2962           if (MB_CUR_MAX == 1)
2963           */
2964           if (nrules == 0)
2965             ch_collseq = collseqmb[ch];
2966           else
2967             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2968           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2969             bitset_set (sbcset, ch);
2970         }
2971       return REG_NOERROR;
2972     }
2973
2974   /* Local function for parse_bracket_exp used in _LIBC environement.
2975      Build the collating element which is represented by NAME.
2976      The result are written to MBCSET and SBCSET.
2977      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2978      pointer argument sinse we may update it.  */
2979
2980   auto inline reg_errcode_t
2981   __attribute ((always_inline))
2982   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2983          re_charset_t *mbcset;
2984          Idx *coll_sym_alloc;
2985          bitset_t sbcset;
2986          const unsigned char *name;
2987     {
2988       int32_t elem, idx;
2989       size_t name_len = strlen ((const char *) name);
2990       if (nrules != 0)
2991         {
2992           elem = seek_collating_symbol_entry (name, name_len);
2993           if (symb_table[2 * elem] != 0)
2994             {
2995               /* We found the entry.  */
2996               idx = symb_table[2 * elem + 1];
2997               /* Skip the name of collating element name.  */
2998               idx += 1 + extra[idx];
2999             }
3000           else if (symb_table[2 * elem] == 0 && name_len == 1)
3001             {
3002               /* No valid character, treat it as a normal
3003                  character.  */
3004               bitset_set (sbcset, name[0]);
3005               return REG_NOERROR;
3006             }
3007           else
3008             return REG_ECOLLATE;
3009
3010           /* Got valid collation sequence, add it as a new entry.  */
3011           /* Check the space of the arrays.  */
3012           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3013             {
3014               /* Not enough, realloc it.  */
3015               /* +1 in case of mbcset->ncoll_syms is 0.  */
3016               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3017               /* Use realloc since mbcset->coll_syms is NULL
3018                  if *alloc == 0.  */
3019               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3020                                                    new_coll_sym_alloc);
3021               if (BE (new_coll_syms == NULL, 0))
3022                 return REG_ESPACE;
3023               mbcset->coll_syms = new_coll_syms;
3024               *coll_sym_alloc = new_coll_sym_alloc;
3025             }
3026           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3027           return REG_NOERROR;
3028         }
3029       else
3030         {
3031           if (BE (name_len != 1, 0))
3032             return REG_ECOLLATE;
3033           else
3034             {
3035               bitset_set (sbcset, name[0]);
3036               return REG_NOERROR;
3037             }
3038         }
3039     }
3040 #endif
3041
3042   re_token_t br_token;
3043   re_bitset_ptr_t sbcset;
3044 #ifdef RE_ENABLE_I18N
3045   re_charset_t *mbcset;
3046   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3047   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3048 #endif /* not RE_ENABLE_I18N */
3049   bool non_match = false;
3050   bin_tree_t *work_tree;
3051   int token_len;
3052   bool first_round = true;
3053 #ifdef _LIBC
3054   collseqmb = (const unsigned char *)
3055     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3056   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3057   if (nrules)
3058     {
3059       /*
3060       if (MB_CUR_MAX > 1)
3061       */
3062       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3063       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3064       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3065                                                   _NL_COLLATE_SYMB_TABLEMB);
3066       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3067                                                    _NL_COLLATE_SYMB_EXTRAMB);
3068     }
3069 #endif
3070   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3071 #ifdef RE_ENABLE_I18N
3072   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3073 #endif /* RE_ENABLE_I18N */
3074 #ifdef RE_ENABLE_I18N
3075   if (BE (sbcset == NULL || mbcset == NULL, 0))
3076 #else
3077   if (BE (sbcset == NULL, 0))
3078 #endif /* RE_ENABLE_I18N */
3079     {
3080       *err = REG_ESPACE;
3081       return NULL;
3082     }
3083
3084   token_len = peek_token_bracket (token, regexp, syntax);
3085   if (BE (token->type == END_OF_RE, 0))
3086     {
3087       *err = REG_BADPAT;
3088       goto parse_bracket_exp_free_return;
3089     }
3090   if (token->type == OP_NON_MATCH_LIST)
3091     {
3092 #ifdef RE_ENABLE_I18N
3093       mbcset->non_match = 1;
3094 #endif /* not RE_ENABLE_I18N */
3095       non_match = true;
3096       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3097         bitset_set (sbcset, '\n');
3098       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3099       token_len = peek_token_bracket (token, regexp, syntax);
3100       if (BE (token->type == END_OF_RE, 0))
3101         {
3102           *err = REG_BADPAT;
3103           goto parse_bracket_exp_free_return;
3104         }
3105     }
3106
3107   /* We treat the first ']' as a normal character.  */
3108   if (token->type == OP_CLOSE_BRACKET)
3109     token->type = CHARACTER;
3110
3111   while (1)
3112     {
3113       bracket_elem_t start_elem, end_elem;
3114       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3115       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3116       reg_errcode_t ret;
3117       int token_len2 = 0;
3118       bool is_range_exp = false;
3119       re_token_t token2;
3120
3121       start_elem.opr.name = start_name_buf;
3122       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3123                                    syntax, first_round);
3124       if (BE (ret != REG_NOERROR, 0))
3125         {
3126           *err = ret;
3127           goto parse_bracket_exp_free_return;
3128         }
3129       first_round = false;
3130
3131       /* Get information about the next token.  We need it in any case.  */
3132       token_len = peek_token_bracket (token, regexp, syntax);
3133
3134       /* Do not check for ranges if we know they are not allowed.  */
3135       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3136         {
3137           if (BE (token->type == END_OF_RE, 0))
3138             {
3139               *err = REG_EBRACK;
3140               goto parse_bracket_exp_free_return;
3141             }
3142           if (token->type == OP_CHARSET_RANGE)
3143             {
3144               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3145               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3146               if (BE (token2.type == END_OF_RE, 0))
3147                 {
3148                   *err = REG_EBRACK;
3149                   goto parse_bracket_exp_free_return;
3150                 }
3151               if (token2.type == OP_CLOSE_BRACKET)
3152                 {
3153                   /* We treat the last '-' as a normal character.  */
3154                   re_string_skip_bytes (regexp, -token_len);
3155                   token->type = CHARACTER;
3156                 }
3157               else
3158                 is_range_exp = true;
3159             }
3160         }
3161
3162       if (is_range_exp == true)
3163         {
3164           end_elem.opr.name = end_name_buf;
3165           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3166                                        dfa, syntax, true);
3167           if (BE (ret != REG_NOERROR, 0))
3168             {
3169               *err = ret;
3170               goto parse_bracket_exp_free_return;
3171             }
3172
3173           token_len = peek_token_bracket (token, regexp, syntax);
3174
3175 #ifdef _LIBC
3176           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3177                                   &start_elem, &end_elem);
3178 #else
3179 # ifdef RE_ENABLE_I18N
3180           *err = build_range_exp (syntax, sbcset,
3181                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3182                                   &range_alloc, &start_elem, &end_elem);
3183 # else
3184           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3185 # endif
3186 #endif /* RE_ENABLE_I18N */
3187           if (BE (*err != REG_NOERROR, 0))
3188             goto parse_bracket_exp_free_return;
3189         }
3190       else
3191         {
3192           switch (start_elem.type)
3193             {
3194             case SB_CHAR:
3195               bitset_set (sbcset, start_elem.opr.ch);
3196               break;
3197 #ifdef RE_ENABLE_I18N
3198             case MB_CHAR:
3199               /* Check whether the array has enough space.  */
3200               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3201                 {
3202                   wchar_t *new_mbchars;
3203                   /* Not enough, realloc it.  */
3204                   /* +1 in case of mbcset->nmbchars is 0.  */
3205                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3206                   /* Use realloc since array is NULL if *alloc == 0.  */
3207                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3208                                             mbchar_alloc);
3209                   if (BE (new_mbchars == NULL, 0))
3210                     goto parse_bracket_exp_espace;
3211                   mbcset->mbchars = new_mbchars;
3212                 }
3213               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3214               break;
3215 #endif /* RE_ENABLE_I18N */
3216             case EQUIV_CLASS:
3217               *err = build_equiv_class (sbcset,
3218 #ifdef RE_ENABLE_I18N
3219                                         mbcset, &equiv_class_alloc,
3220 #endif /* RE_ENABLE_I18N */
3221                                         start_elem.opr.name);
3222               if (BE (*err != REG_NOERROR, 0))
3223                 goto parse_bracket_exp_free_return;
3224               break;
3225             case COLL_SYM:
3226               *err = build_collating_symbol (sbcset,
3227 #ifdef RE_ENABLE_I18N
3228                                              mbcset, &coll_sym_alloc,
3229 #endif /* RE_ENABLE_I18N */
3230                                              start_elem.opr.name);
3231               if (BE (*err != REG_NOERROR, 0))
3232                 goto parse_bracket_exp_free_return;
3233               break;
3234             case CHAR_CLASS:
3235               *err = build_charclass (regexp->trans, sbcset,
3236 #ifdef RE_ENABLE_I18N
3237                                       mbcset, &char_class_alloc,
3238 #endif /* RE_ENABLE_I18N */
3239                                       start_elem.opr.name, syntax);
3240               if (BE (*err != REG_NOERROR, 0))
3241                goto parse_bracket_exp_free_return;
3242               break;
3243             default:
3244               assert (0);
3245               break;
3246             }
3247         }
3248       if (BE (token->type == END_OF_RE, 0))
3249         {
3250           *err = REG_EBRACK;
3251           goto parse_bracket_exp_free_return;
3252         }
3253       if (token->type == OP_CLOSE_BRACKET)
3254         break;
3255     }
3256
3257   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3258
3259   /* If it is non-matching list.  */
3260   if (non_match)
3261     bitset_not (sbcset);
3262
3263 #ifdef RE_ENABLE_I18N
3264   /* Ensure only single byte characters are set.  */
3265   if (dfa->mb_cur_max > 1)
3266     bitset_mask (sbcset, dfa->sb_char);
3267
3268   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3269       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3270                                                      || mbcset->non_match)))
3271     {
3272       bin_tree_t *mbc_tree;
3273       int sbc_idx;
3274       /* Build a tree for complex bracket.  */
3275       dfa->has_mb_node = 1;
3276       br_token.type = COMPLEX_BRACKET;
3277       br_token.opr.mbcset = mbcset;
3278       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3279       if (BE (mbc_tree == NULL, 0))
3280         goto parse_bracket_exp_espace;
3281       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3282         if (sbcset[sbc_idx])
3283           break;
3284       /* If there are no bits set in sbcset, there is no point
3285          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3286       if (sbc_idx < BITSET_WORDS)
3287         {
3288           /* Build a tree for simple bracket.  */
3289           br_token.type = SIMPLE_BRACKET;
3290           br_token.opr.sbcset = sbcset;
3291           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3292           if (BE (work_tree == NULL, 0))
3293             goto parse_bracket_exp_espace;
3294
3295           /* Then join them by ALT node.  */
3296           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3297           if (BE (work_tree == NULL, 0))
3298             goto parse_bracket_exp_espace;
3299         }
3300       else
3301         {
3302           re_free (sbcset);
3303           work_tree = mbc_tree;
3304         }
3305     }
3306   else
3307 #endif /* not RE_ENABLE_I18N */
3308     {
3309 #ifdef RE_ENABLE_I18N
3310       free_charset (mbcset);
3311 #endif
3312       /* Build a tree for simple bracket.  */
3313       br_token.type = SIMPLE_BRACKET;
3314       br_token.opr.sbcset = sbcset;
3315       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3316       if (BE (work_tree == NULL, 0))
3317         goto parse_bracket_exp_espace;
3318     }
3319   return work_tree;
3320
3321  parse_bracket_exp_espace:
3322   *err = REG_ESPACE;
3323  parse_bracket_exp_free_return:
3324   re_free (sbcset);
3325 #ifdef RE_ENABLE_I18N
3326   free_charset (mbcset);
3327 #endif /* RE_ENABLE_I18N */
3328   return NULL;
3329 }
3330
3331 /* Parse an element in the bracket expression.  */
3332
3333 static reg_errcode_t
3334 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3335                        re_token_t *token, int token_len, re_dfa_t *dfa,
3336                        reg_syntax_t syntax, bool accept_hyphen)
3337 {
3338 #ifdef RE_ENABLE_I18N
3339   int cur_char_size;
3340   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3341   if (cur_char_size > 1)
3342     {
3343       elem->type = MB_CHAR;
3344       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3345       re_string_skip_bytes (regexp, cur_char_size);
3346       return REG_NOERROR;
3347     }
3348 #endif /* RE_ENABLE_I18N */
3349   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3350   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3351       || token->type == OP_OPEN_EQUIV_CLASS)
3352     return parse_bracket_symbol (elem, regexp, token);
3353   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3354     {
3355       /* A '-' must only appear as anything but a range indicator before
3356          the closing bracket.  Everything else is an error.  */
3357       re_token_t token2;
3358       (void) peek_token_bracket (&token2, regexp, syntax);
3359       if (token2.type != OP_CLOSE_BRACKET)
3360         /* The actual error value is not standardized since this whole
3361            case is undefined.  But ERANGE makes good sense.  */
3362         return REG_ERANGE;
3363     }
3364   elem->type = SB_CHAR;
3365   elem->opr.ch = token->opr.c;
3366   return REG_NOERROR;
3367 }
3368
3369 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3370    such as [:<character_class>:], [.<collating_element>.], and
3371    [=<equivalent_class>=].  */
3372
3373 static reg_errcode_t
3374 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3375                       re_token_t *token)
3376 {
3377   unsigned char ch, delim = token->opr.c;
3378   int i = 0;
3379   if (re_string_eoi(regexp))
3380     return REG_EBRACK;
3381   for (;; ++i)
3382     {
3383       if (i >= BRACKET_NAME_BUF_SIZE)
3384         return REG_EBRACK;
3385       if (token->type == OP_OPEN_CHAR_CLASS)
3386         ch = re_string_fetch_byte_case (regexp);
3387       else
3388         ch = re_string_fetch_byte (regexp);
3389       if (re_string_eoi(regexp))
3390         return REG_EBRACK;
3391       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3392         break;
3393       elem->opr.name[i] = ch;
3394     }
3395   re_string_skip_bytes (regexp, 1);
3396   elem->opr.name[i] = '\0';
3397   switch (token->type)
3398     {
3399     case OP_OPEN_COLL_ELEM:
3400       elem->type = COLL_SYM;
3401       break;
3402     case OP_OPEN_EQUIV_CLASS:
3403       elem->type = EQUIV_CLASS;
3404       break;
3405     case OP_OPEN_CHAR_CLASS:
3406       elem->type = CHAR_CLASS;
3407       break;
3408     default:
3409       break;
3410     }
3411   return REG_NOERROR;
3412 }
3413
3414   /* Helper function for parse_bracket_exp.
3415      Build the equivalence class which is represented by NAME.
3416      The result are written to MBCSET and SBCSET.
3417      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3418      is a pointer argument sinse we may update it.  */
3419
3420 static reg_errcode_t
3421 #ifdef RE_ENABLE_I18N
3422 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3423                    Idx *equiv_class_alloc, const unsigned char *name)
3424 #else /* not RE_ENABLE_I18N */
3425 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3426 #endif /* not RE_ENABLE_I18N */
3427 {
3428 #ifdef _LIBC
3429   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3430   if (nrules != 0)
3431     {
3432       const int32_t *table, *indirect;
3433       const unsigned char *weights, *extra, *cp;
3434       unsigned char char_buf[2];
3435       int32_t idx1, idx2;
3436       unsigned int ch;
3437       size_t len;
3438       /* This #include defines a local function!  */
3439 # include <locale/weight.h>
3440       /* Calculate the index for equivalence class.  */
3441       cp = name;
3442       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3443       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3444                                                _NL_COLLATE_WEIGHTMB);
3445       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3446                                                    _NL_COLLATE_EXTRAMB);
3447       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3448                                                 _NL_COLLATE_INDIRECTMB);
3449       idx1 = findidx (&cp);
3450       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3451         /* This isn't a valid character.  */
3452         return REG_ECOLLATE;
3453
3454       /* Build single byte matcing table for this equivalence class.  */
3455       char_buf[1] = (unsigned char) '\0';
3456       len = weights[idx1 & 0xffffff];
3457       for (ch = 0; ch < SBC_MAX; ++ch)
3458         {
3459           char_buf[0] = ch;
3460           cp = char_buf;
3461           idx2 = findidx (&cp);
3462 /*
3463           idx2 = table[ch];
3464 */
3465           if (idx2 == 0)
3466             /* This isn't a valid character.  */
3467             continue;
3468           /* Compare only if the length matches and the collation rule
3469              index is the same.  */
3470           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3471             {
3472               int cnt = 0;
3473
3474               while (cnt <= len &&
3475                      weights[(idx1 & 0xffffff) + 1 + cnt]
3476                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3477                 ++cnt;
3478
3479               if (cnt > len)
3480                 bitset_set (sbcset, ch);
3481             }
3482         }
3483       /* Check whether the array has enough space.  */
3484       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3485         {
3486           /* Not enough, realloc it.  */
3487           /* +1 in case of mbcset->nequiv_classes is 0.  */
3488           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3489           /* Use realloc since the array is NULL if *alloc == 0.  */
3490           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3491                                                    int32_t,
3492                                                    new_equiv_class_alloc);
3493           if (BE (new_equiv_classes == NULL, 0))
3494             return REG_ESPACE;
3495           mbcset->equiv_classes = new_equiv_classes;
3496           *equiv_class_alloc = new_equiv_class_alloc;
3497         }
3498       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3499     }
3500   else
3501 #endif /* _LIBC */
3502     {
3503       if (BE (strlen ((const char *) name) != 1, 0))
3504         return REG_ECOLLATE;
3505       bitset_set (sbcset, *name);
3506     }
3507   return REG_NOERROR;
3508 }
3509
3510   /* Helper function for parse_bracket_exp.
3511      Build the character class which is represented by NAME.
3512      The result are written to MBCSET and SBCSET.
3513      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3514      is a pointer argument sinse we may update it.  */
3515
3516 static reg_errcode_t
3517 #ifdef RE_ENABLE_I18N
3518 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3519                  re_charset_t *mbcset, Idx *char_class_alloc,
3520                  const unsigned char *class_name, reg_syntax_t syntax)
3521 #else /* not RE_ENABLE_I18N */
3522 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3523                  const unsigned char *class_name, reg_syntax_t syntax)
3524 #endif /* not RE_ENABLE_I18N */
3525 {
3526   int i;
3527   const char *name = (const char *) class_name;
3528
3529   /* In case of REG_ICASE "upper" and "lower" match the both of
3530      upper and lower cases.  */
3531   if ((syntax & RE_ICASE)
3532       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3533     name = "alpha";
3534
3535 #ifdef RE_ENABLE_I18N
3536   /* Check the space of the arrays.  */
3537   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3538     {
3539       /* Not enough, realloc it.  */
3540       /* +1 in case of mbcset->nchar_classes is 0.  */
3541       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3542       /* Use realloc since array is NULL if *alloc == 0.  */
3543       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3544                                                new_char_class_alloc);
3545       if (BE (new_char_classes == NULL, 0))
3546         return REG_ESPACE;
3547       mbcset->char_classes = new_char_classes;
3548       *char_class_alloc = new_char_class_alloc;
3549     }
3550   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3551 #endif /* RE_ENABLE_I18N */
3552
3553 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3554   do {                                          \
3555     if (BE (trans != NULL, 0))                  \
3556       {                                         \
3557         for (i = 0; i < SBC_MAX; ++i)           \
3558           if (ctype_func (i))                   \
3559             bitset_set (sbcset, trans[i]);      \
3560       }                                         \
3561     else                                        \
3562       {                                         \
3563         for (i = 0; i < SBC_MAX; ++i)           \
3564           if (ctype_func (i))                   \
3565             bitset_set (sbcset, i);             \
3566       }                                         \
3567   } while (0)
3568
3569   if (strcmp (name, "alnum") == 0)
3570     BUILD_CHARCLASS_LOOP (isalnum);
3571   else if (strcmp (name, "cntrl") == 0)
3572     BUILD_CHARCLASS_LOOP (iscntrl);
3573   else if (strcmp (name, "lower") == 0)
3574     BUILD_CHARCLASS_LOOP (islower);
3575   else if (strcmp (name, "space") == 0)
3576     BUILD_CHARCLASS_LOOP (isspace);
3577   else if (strcmp (name, "alpha") == 0)
3578     BUILD_CHARCLASS_LOOP (isalpha);
3579   else if (strcmp (name, "digit") == 0)
3580     BUILD_CHARCLASS_LOOP (isdigit);
3581   else if (strcmp (name, "print") == 0)
3582     BUILD_CHARCLASS_LOOP (isprint);
3583   else if (strcmp (name, "upper") == 0)
3584     BUILD_CHARCLASS_LOOP (isupper);
3585   else if (strcmp (name, "blank") == 0)
3586     BUILD_CHARCLASS_LOOP (isblank);
3587   else if (strcmp (name, "graph") == 0)
3588     BUILD_CHARCLASS_LOOP (isgraph);
3589   else if (strcmp (name, "punct") == 0)
3590     BUILD_CHARCLASS_LOOP (ispunct);
3591   else if (strcmp (name, "xdigit") == 0)
3592     BUILD_CHARCLASS_LOOP (isxdigit);
3593   else
3594     return REG_ECTYPE;
3595
3596   return REG_NOERROR;
3597 }
3598
3599 static bin_tree_t *
3600 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3601                     const unsigned char *class_name,
3602                     const unsigned char *extra, bool non_match,
3603                     reg_errcode_t *err)
3604 {
3605   re_bitset_ptr_t sbcset;
3606 #ifdef RE_ENABLE_I18N
3607   re_charset_t *mbcset;
3608   Idx alloc = 0;
3609 #endif /* not RE_ENABLE_I18N */
3610   reg_errcode_t ret;
3611   re_token_t br_token;
3612   bin_tree_t *tree;
3613
3614   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3615 #ifdef RE_ENABLE_I18N
3616   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3617 #endif /* RE_ENABLE_I18N */
3618
3619 #ifdef RE_ENABLE_I18N
3620   if (BE (sbcset == NULL || mbcset == NULL, 0))
3621 #else /* not RE_ENABLE_I18N */
3622   if (BE (sbcset == NULL, 0))
3623 #endif /* not RE_ENABLE_I18N */
3624     {
3625       *err = REG_ESPACE;
3626       return NULL;
3627     }
3628
3629   if (non_match)
3630     {
3631 #ifdef RE_ENABLE_I18N
3632       mbcset->non_match = 1;
3633 #endif /* not RE_ENABLE_I18N */
3634     }
3635
3636   /* We don't care the syntax in this case.  */
3637   ret = build_charclass (trans, sbcset,
3638 #ifdef RE_ENABLE_I18N
3639                          mbcset, &alloc,
3640 #endif /* RE_ENABLE_I18N */
3641                          class_name, 0);
3642
3643   if (BE (ret != REG_NOERROR, 0))
3644     {
3645       re_free (sbcset);
3646 #ifdef RE_ENABLE_I18N
3647       free_charset (mbcset);
3648 #endif /* RE_ENABLE_I18N */
3649       *err = ret;
3650       return NULL;
3651     }
3652   /* \w match '_' also.  */
3653   for (; *extra; extra++)
3654     bitset_set (sbcset, *extra);
3655
3656   /* If it is non-matching list.  */
3657   if (non_match)
3658     bitset_not (sbcset);
3659
3660 #ifdef RE_ENABLE_I18N
3661   /* Ensure only single byte characters are set.  */
3662   if (dfa->mb_cur_max > 1)
3663     bitset_mask (sbcset, dfa->sb_char);
3664 #endif
3665
3666   /* Build a tree for simple bracket.  */
3667   br_token.type = SIMPLE_BRACKET;
3668   br_token.opr.sbcset = sbcset;
3669   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3670   if (BE (tree == NULL, 0))
3671     goto build_word_op_espace;
3672
3673 #ifdef RE_ENABLE_I18N
3674   if (dfa->mb_cur_max > 1)
3675     {
3676       bin_tree_t *mbc_tree;
3677       /* Build a tree for complex bracket.  */
3678       br_token.type = COMPLEX_BRACKET;
3679       br_token.opr.mbcset = mbcset;
3680       dfa->has_mb_node = 1;
3681       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3682       if (BE (mbc_tree == NULL, 0))
3683         goto build_word_op_espace;
3684       /* Then join them by ALT node.  */
3685       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3686       if (BE (mbc_tree != NULL, 1))
3687         return tree;
3688     }
3689   else
3690     {
3691       free_charset (mbcset);
3692       return tree;
3693     }
3694 #else /* not RE_ENABLE_I18N */
3695   return tree;
3696 #endif /* not RE_ENABLE_I18N */
3697
3698  build_word_op_espace:
3699   re_free (sbcset);
3700 #ifdef RE_ENABLE_I18N
3701   free_charset (mbcset);
3702 #endif /* RE_ENABLE_I18N */
3703   *err = REG_ESPACE;
3704   return NULL;
3705 }
3706
3707 /* This is intended for the expressions like "a{1,3}".
3708    Fetch a number from `input', and return the number.
3709    Return REG_MISSING if the number field is empty like "{,1}".
3710    Return REG_ERROR if an error occurred.  */
3711
3712 static Idx
3713 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3714 {
3715   Idx num = REG_MISSING;
3716   unsigned char c;
3717   while (1)
3718     {
3719       fetch_token (token, input, syntax);
3720       c = token->opr.c;
3721       if (BE (token->type == END_OF_RE, 0))
3722         return REG_ERROR;
3723       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3724         break;
3725       num = ((token->type != CHARACTER || c < '0' || '9' < c
3726               || num == REG_ERROR)
3727              ? REG_ERROR
3728              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3729       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3730     }
3731   return num;
3732 }
3733 \f
#ifdef RE_ENABLE_I18N
/* Release the multibyte character set CSET: first every array it points
   to, then the structure itself.  The collation-related arrays are only
   released when _LIBC is defined.  */
static void
free_charset (re_charset_t *cset)
{
# ifdef _LIBC
  re_free (cset->range_ends);
  re_free (cset->range_starts);
  re_free (cset->equiv_classes);
  re_free (cset->coll_syms);
# endif
  re_free (cset->char_classes);
  re_free (cset->mbchars);

  /* The container goes last, after everything reached through it.  */
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3749 \f
3750 /* Functions for binary tree operation.  */
3751
3752 /* Create a tree node.  */
3753
3754 static bin_tree_t *
3755 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3756              re_token_type_t type)
3757 {
3758   re_token_t t;
3759   t.type = type;
3760   return create_token_tree (dfa, left, right, &t);
3761 }
3762
3763 static bin_tree_t *
3764 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3765                    const re_token_t *token)
3766 {
3767   bin_tree_t *tree;
3768   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3769     {
3770       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3771
3772       if (storage == NULL)
3773         return NULL;
3774       storage->next = dfa->str_tree_storage;
3775       dfa->str_tree_storage = storage;
3776       dfa->str_tree_storage_idx = 0;
3777     }
3778   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3779
3780   tree->parent = NULL;
3781   tree->left = left;
3782   tree->right = right;
3783   tree->token = *token;
3784   tree->token.duplicated = 0;
3785   tree->token.opt_subexp = 0;
3786   tree->first = NULL;
3787   tree->next = NULL;
3788   tree->node_idx = REG_MISSING;
3789
3790   if (left != NULL)
3791     left->parent = tree;
3792   if (right != NULL)
3793     right->parent = tree;
3794   return tree;
3795 }
3796
3797 /* Mark the tree SRC as an optional subexpression.
3798    To be called from preorder or postorder.  */
3799
3800 static reg_errcode_t
3801 mark_opt_subexp (void *extra, bin_tree_t *node)
3802 {
3803   Idx idx = (Idx) (long) extra;
3804   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3805     node->token.opt_subexp = 1;
3806
3807   return REG_NOERROR;
3808 }
3809
3810 /* Free the allocated memory inside NODE. */
3811
3812 static void
3813 free_token (re_token_t *node)
3814 {
3815 #ifdef RE_ENABLE_I18N
3816   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3817     free_charset (node->opr.mbcset);
3818   else
3819 #endif /* RE_ENABLE_I18N */
3820     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3821       re_free (node->opr.sbcset);
3822 }
3823
3824 /* Worker function for tree walking.  Free the allocated memory inside NODE
3825    and its children. */
3826
3827 static reg_errcode_t
3828 free_tree (void *extra, bin_tree_t *node)
3829 {
3830   free_token (&node->token);
3831   return REG_NOERROR;
3832 }
3833
3834
3835 /* Duplicate the node SRC, and return new node.  This is a preorder
3836    visit similar to the one implemented by the generic visitor, but
3837    we need more infrastructure to maintain two parallel trees --- so,
3838    it's easier to duplicate.  */
3839
3840 static bin_tree_t *
3841 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3842 {
3843   const bin_tree_t *node;
3844   bin_tree_t *dup_root;
3845   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3846
3847   for (node = root; ; )
3848     {
3849       /* Create a new tree and link it back to the current parent.  */
3850       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3851       if (*p_new == NULL)
3852         return NULL;
3853       (*p_new)->parent = dup_node;
3854       (*p_new)->token.duplicated = 1;
3855       dup_node = *p_new;
3856
3857       /* Go to the left node, or up and to the right.  */
3858       if (node->left)
3859         {
3860           node = node->left;
3861           p_new = &dup_node->left;
3862         }
3863       else
3864         {
3865           const bin_tree_t *prev = NULL;
3866           while (node->right == prev || node->right == NULL)
3867             {
3868               prev = node;
3869               node = node->parent;
3870               dup_node = dup_node->parent;
3871               if (!node)
3872                 return dup_root;
3873             }
3874           node = node->right;
3875           p_new = &dup_node->right;
3876         }
3877     }
3878 }