git.gag.com Git - debian/amanda/blob - gnulib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
   3    Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License along
  18    with this program; if not, write to the Free Software Foundation,
  19    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  50 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          Idx node, bool root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static Idx fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   Idx nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  Idx nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      Idx nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   Idx nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             bool accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         Idx *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       Idx *char_class_alloc,
  98                                       const unsigned char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const unsigned char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const unsigned char *class_name,
 111                                        const unsigned char *extra,
 112                                        bool non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 static const char __re_error_msgid[] =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 static const size_t __re_error_msgid_idx[] =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206 /* re_compile_pattern is the GNU regular expression compiler: it
 207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 208    Returns 0 if the pattern was valid, otherwise an error string.
 209
 210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 211    are set in BUFP on entry.  */
 212
 213 #ifdef _LIBC
 214 const char *
 215 re_compile_pattern (pattern, length, bufp)
 216     const char *pattern;
 217     size_t length;
 218     struct re_pattern_buffer *bufp;
 219 #else /* size_t might promote */
 220 const char *
 221 re_compile_pattern (const char *pattern, size_t length,
 222                     struct re_pattern_buffer *bufp)
 223 #endif
 224 {
 225   reg_errcode_t ret;
 226
 227   /* And GNU code determines whether or not to get register information
 228      by passing null for the REGS argument to re_match, etc., not by
 229      setting no_sub, unless RE_NO_SUB is set.  */
 230   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 231
 232   /* Match anchors at newline.  */
 233   bufp->newline_anchor = 1;
 234
 235   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 236
 237   if (!ret)
 238     return NULL;
 239   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 240 }
 241 #ifdef _LIBC
 242 weak_alias (__re_compile_pattern, re_compile_pattern)
 243 #endif
 244
 245 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 246    also be assigned to arbitrarily: each pattern buffer stores its own
 247    syntax, so it can be changed between regex compilations.  */
 248 /* This has no initializer because initialized variables in Emacs
 249    become read-only after dumping.  */
 250 reg_syntax_t re_syntax_options;
 251
 252
 253 /* Specify the precise syntax of regexps for compilation.  This provides
 254    for compatibility for various utilities which historically have
 255    different, incompatible syntaxes.
 256
 257    The argument SYNTAX is a bit mask comprised of the various bits
 258    defined in regex.h.  We return the old syntax.  */
 259
 260 reg_syntax_t
 261 re_set_syntax (syntax)
 262     reg_syntax_t syntax;
 263 {
 264   reg_syntax_t ret = re_syntax_options;
 265
 266   re_syntax_options = syntax;
 267   return ret;
 268 }
 269 #ifdef _LIBC
 270 weak_alias (__re_set_syntax, re_set_syntax)
 271 #endif
 272
 273 int
 274 re_compile_fastmap (bufp)
 275     struct re_pattern_buffer *bufp;
 276 {
 277   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 278   char *fastmap = bufp->fastmap;
 279
 280   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 281   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 282   if (dfa->init_state != dfa->init_state_word)
 283     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 284   if (dfa->init_state != dfa->init_state_nl)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 286   if (dfa->init_state != dfa->init_state_begbuf)
 287     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 288   bufp->fastmap_accurate = 1;
 289   return 0;
 290 }
 291 #ifdef _LIBC
 292 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 293 #endif
 294
 295 static inline void
 296 __attribute ((always_inline))
 297 re_set_fastmap (char *fastmap, bool icase, int ch)
 298 {
 299   fastmap[ch] = 1;
 300   if (icase)
 301     fastmap[tolower (ch)] = 1;
 302 }
 303
 304 /* Helper function for re_compile_fastmap.
 305    Compile fastmap for the initial_state INIT_STATE.  */
 306
 307 static void
 308 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 309                          char *fastmap)
 310 {
 311   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 312   Idx node_cnt;
 313   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 314   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 315     {
 316       Idx node = init_state->nodes.elems[node_cnt];
 317       re_token_type_t type = dfa->nodes[node].type;
 318
 319       if (type == CHARACTER)
 320         {
 321           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 322 #ifdef RE_ENABLE_I18N
 323           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 324             {
 325               unsigned char buf[MB_LEN_MAX];
 326               unsigned char *p;
 327               wchar_t wc;
 328               mbstate_t state;
 329
 330               p = buf;
 331               *p++ = dfa->nodes[node].opr.c;
 332               while (++node < dfa->nodes_len
 333                      && dfa->nodes[node].type == CHARACTER
 334                      && dfa->nodes[node].mb_partial)
 335                 *p++ = dfa->nodes[node].opr.c;
 336               memset (&state, '\0', sizeof (state));
 337               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 338                              &state) == p - buf
 339                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 340                       != (size_t) -1))
 341                 re_set_fastmap (fastmap, false, buf[0]);
 342             }
 343 #endif
 344         }
 345       else if (type == SIMPLE_BRACKET)
 346         {
 347           int i, ch;
 348           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 349             {
 350               int j;
 351               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 352               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 353                 if (w & ((bitset_word_t) 1 << j))
 354                   re_set_fastmap (fastmap, icase, ch);
 355             }
 356         }
 357 #ifdef RE_ENABLE_I18N
 358       else if (type == COMPLEX_BRACKET)
 359         {
 360           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 361           Idx i;
 362
 363 # ifdef _LIBC
 364           /* See if we have to try all bytes which start multiple collation
 365              elements.
 366              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 367                   collation element, and don't catch 'b' since 'b' is
 368                   the only collation element which starts from 'b' (and
 369                   it is caught by SIMPLE_BRACKET).  */
 370               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 371                   && (cset->ncoll_syms || cset->nranges))
 372                 {
 373                   const int32_t *table = (const int32_t *)
 374                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 375                   for (i = 0; i < SBC_MAX; ++i)
 376                     if (table[i] < 0)
 377                       re_set_fastmap (fastmap, icase, i);
 378                 }
 379 # endif /* _LIBC */
 380
 381           /* See if we have to start the match at all multibyte characters,
 382              i.e. where we would not find an invalid sequence.  This only
 383              applies to multibyte character sets; for single byte character
 384              sets, the SIMPLE_BRACKET again suffices.  */
 385           if (dfa->mb_cur_max > 1
 386               && (cset->nchar_classes || cset->non_match || cset->nranges
 387 # ifdef _LIBC
 388                   || cset->nequiv_classes
 389 # endif /* _LIBC */
 390                  ))
 391             {
 392               unsigned char c = 0;
 393               do
 394                 {
 395                   mbstate_t mbs;
 396                   memset (&mbs, 0, sizeof (mbs));
 397                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 398                     re_set_fastmap (fastmap, false, (int) c);
 399                 }
 400               while (++c != 0);
 401             }
 402
 403           else
 404             {
 405               /* ... Else catch all bytes which can start the mbchars.  */
 406               for (i = 0; i < cset->nmbchars; ++i)
 407                 {
 408                   char buf[256];
 409                   mbstate_t state;
 410                   memset (&state, '\0', sizeof (state));
 411                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 412                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 413                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 414                     {
 415                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 416                           != (size_t) -1)
 417                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 418                     }
 419                 }
 420             }
 421         }
 422 #endif /* RE_ENABLE_I18N */
 423       else if (type == OP_PERIOD
 424 #ifdef RE_ENABLE_I18N
 425                || type == OP_UTF8_PERIOD
 426 #endif /* RE_ENABLE_I18N */
 427                || type == END_OF_RE)
 428         {
 429           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 430           if (type == END_OF_RE)
 431             bufp->can_be_null = 1;
 432           return;
 433         }
 434     }
 435 }
 436 \f
 437 /* Entry point for POSIX code.  */
 438 /* regcomp takes a regular expression as a string and compiles it.
 439
 440    PREG is a regex_t *.  We do not expect any fields to be initialized,
 441    since POSIX says we shouldn't.  Thus, we set
 442
 443      `buffer' to the compiled pattern;
 444      `used' to the length of the compiled pattern;
 445      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 446        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 447        RE_SYNTAX_POSIX_BASIC;
 448      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 449      `fastmap' to an allocated space for the fastmap;
 450      `fastmap_accurate' to zero;
 451      `re_nsub' to the number of subexpressions in PATTERN.
 452
 453    PATTERN is the address of the pattern string.
 454
 455    CFLAGS is a series of bits which affect compilation.
 456
 457      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 458      use POSIX basic syntax.
 459
 460      If REG_NEWLINE is set, then . and [^...] don't match newline.
 461      Also, regexec will try a match beginning after every newline.
 462
 463      If REG_ICASE is set, then we considers upper- and lowercase
 464      versions of letters to be equivalent when matching.
 465
 466      If REG_NOSUB is set, then when PREG is passed to regexec, that
 467      routine will report only success or failure, and nothing about the
 468      registers.
 469
 470    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 471    the return codes and their meanings.)  */
 472
 473 int
 474 regcomp (preg, pattern, cflags)
 475     regex_t *_Restrict_ preg;
 476     const char *_Restrict_ pattern;
 477     int cflags;
 478 {
 479   reg_errcode_t ret;
 480   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 481                          : RE_SYNTAX_POSIX_BASIC);
 482
 483   preg->buffer = NULL;
 484   preg->allocated = 0;
 485   preg->used = 0;
 486
 487   /* Try to allocate space for the fastmap.  */
 488   preg->fastmap = re_malloc (char, SBC_MAX);
 489   if (BE (preg->fastmap == NULL, 0))
 490     return REG_ESPACE;
 491
 492   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 493
 494   /* If REG_NEWLINE is set, newlines are treated differently.  */
 495   if (cflags & REG_NEWLINE)
 496     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 497       syntax &= ~RE_DOT_NEWLINE;
 498       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 499       /* It also changes the matching behavior.  */
 500       preg->newline_anchor = 1;
 501     }
 502   else
 503     preg->newline_anchor = 0;
 504   preg->no_sub = !!(cflags & REG_NOSUB);
 505   preg->translate = NULL;
 506
 507   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 508
 509   /* POSIX doesn't distinguish between an unmatched open-group and an
 510      unmatched close-group: both are REG_EPAREN.  */
 511   if (ret == REG_ERPAREN)
 512     ret = REG_EPAREN;
 513
 514   /* We have already checked preg->fastmap != NULL.  */
 515   if (BE (ret == REG_NOERROR, 1))
 516     /* Compute the fastmap now, since regexec cannot modify the pattern
 517        buffer.  This function never fails in this implementation.  */
 518     (void) re_compile_fastmap (preg);
 519   else
 520     {
 521       /* Some error occurred while compiling the expression.  */
 522       re_free (preg->fastmap);
 523       preg->fastmap = NULL;
 524     }
 525
 526   return (int) ret;
 527 }
 528 #ifdef _LIBC
 529 weak_alias (__regcomp, regcomp)
 530 #endif
 531
 532 /* Returns a message corresponding to an error code, ERRCODE, returned
 533    from either regcomp or regexec.   We don't use PREG here.  */
 534
 535 #ifdef _LIBC
 536 size_t
 537 regerror (errcode, preg, errbuf, errbuf_size)
 538     int errcode;
 539     const regex_t *_Restrict_ preg;
 540     char *_Restrict_ errbuf;
 541     size_t errbuf_size;
 542 #else /* size_t might promote */
 543 size_t
 544 regerror (int errcode, const regex_t *_Restrict_ preg,
 545           char *_Restrict_ errbuf, size_t errbuf_size)
 546 #endif
 547 {
 548   const char *msg;
 549   size_t msg_size;
 550
 551   if (BE (errcode < 0
 552           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 553                                / sizeof (__re_error_msgid_idx[0])), 0))
 554     /* Only error codes returned by the rest of the code should be passed
 555        to this routine.  If we are given anything else, or if other regex
 556        code generates an invalid error code, then the program has a bug.
 557        Dump core so we can fix it.  */
 558     abort ();
 559
 560   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 561
 562   msg_size = strlen (msg) + 1; /* Includes the null.  */
 563
 564   if (BE (errbuf_size != 0, 1))
 565     {
 566       size_t cpy_size = msg_size;
 567       if (BE (msg_size > errbuf_size, 0))
 568         {
 569           cpy_size = errbuf_size - 1;
 570           errbuf[cpy_size] = '\0';
 571         }
 572       memcpy (errbuf, msg, cpy_size);
 573     }
 574
 575   return msg_size;
 576 }
 577 #ifdef _LIBC
 578 weak_alias (__regerror, regerror)
 579 #endif
 580
 581
 582 #ifdef RE_ENABLE_I18N
 583 /* This static array is used for the map to single-byte characters when
 584    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 585    it the same all the time.  UTF-8 is the preferred encoding so this is
 586    a worthwhile optimization.  */
 587 static const bitset_t utf8_sb_map =
 588 {
 589   /* Set the first 128 bits.  */
 590 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 591 #  error "bitset_word_t is narrower than 32 bits"
 592 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 593   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 594 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 595   BITSET_WORD_MAX, BITSET_WORD_MAX,
 596 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX,
 598 # endif
 599   (BITSET_WORD_MAX
 600    >> (SBC_MAX % BITSET_WORD_BITS == 0
 601        ? 0
 602        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 603 };
 604 #endif
 605
 606
 607 static void
 608 free_dfa_content (re_dfa_t *dfa)
 609 {
 610   Idx i, j;
 611
 612   if (dfa->nodes)
 613     for (i = 0; i < dfa->nodes_len; ++i)
 614       free_token (dfa->nodes + i);
 615   re_free (dfa->nexts);
 616   for (i = 0; i < dfa->nodes_len; ++i)
 617     {
 618       if (dfa->eclosures != NULL)
 619         re_node_set_free (dfa->eclosures + i);
 620       if (dfa->inveclosures != NULL)
 621         re_node_set_free (dfa->inveclosures + i);
 622       if (dfa->edests != NULL)
 623         re_node_set_free (dfa->edests + i);
 624     }
 625   re_free (dfa->edests);
 626   re_free (dfa->eclosures);
 627   re_free (dfa->inveclosures);
 628   re_free (dfa->nodes);
 629
 630   if (dfa->state_table)
 631     for (i = 0; i <= dfa->state_hash_mask; ++i)
 632       {
 633         struct re_state_table_entry *entry = dfa->state_table + i;
 634         for (j = 0; j < entry->num; ++j)
 635           {
 636             re_dfastate_t *state = entry->array[j];
 637             free_state (state);
 638           }
 639         re_free (entry->array);
 640       }
 641   re_free (dfa->state_table);
 642 #ifdef RE_ENABLE_I18N
 643   if (dfa->sb_char != utf8_sb_map)
 644     re_free (dfa->sb_char);
 645 #endif
 646   re_free (dfa->subexp_map);
 647 #ifdef DEBUG
 648   re_free (dfa->re_str);
 649 #endif
 650
 651   re_free (dfa);
 652 }
 653
 654
 655 /* Free dynamically allocated space used by PREG.  */
 656
 657 void
 658 regfree (preg)
 659     regex_t *preg;
 660 {
 661   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 662   if (BE (dfa != NULL, 1))
 663     free_dfa_content (dfa);
 664   preg->buffer = NULL;
 665   preg->allocated = 0;
 666
 667   re_free (preg->fastmap);
 668   preg->fastmap = NULL;
 669
 670   re_free (preg->translate);
 671   preg->translate = NULL;
 672 }
 673 #ifdef _LIBC
 674 weak_alias (__regfree, regfree)
 675 #endif
 676 \f
 677 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 678    them unless specifically requested.  */
 679
 680 #if defined _REGEX_RE_COMP || defined _LIBC
 681
 682 /* BSD has one and only one pattern buffer.  */
 683 static struct re_pattern_buffer re_comp_buf;
 684
 685 char *
 686 # ifdef _LIBC
 687 /* Make these definitions weak in libc, so POSIX programs can redefine
 688    these names if they don't use our functions, and still use
 689    regcomp/regexec above without link errors.  */
 690 weak_function
 691 # endif
 692 re_comp (s)
 693      const char *s;
 694 {
 695   reg_errcode_t ret;
 696   char *fastmap;
 697
 698   if (!s)
 699     {
 700       if (!re_comp_buf.buffer)
 701         return gettext ("No previous regular expression");
 702       return 0;
 703     }
 704
 705   if (re_comp_buf.buffer)
 706     {
 707       fastmap = re_comp_buf.fastmap;
 708       re_comp_buf.fastmap = NULL;
 709       __regfree (&re_comp_buf);
 710       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 711       re_comp_buf.fastmap = fastmap;
 712     }
 713
 714   if (re_comp_buf.fastmap == NULL)
 715     {
 716       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 717       if (re_comp_buf.fastmap == NULL)
 718         return (char *) gettext (__re_error_msgid
 719                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 720     }
 721
 722   /* Since `re_exec' always passes NULL for the `regs' argument, we
 723      don't need to initialize the pattern buffer fields which affect it.  */
 724
 725   /* Match anchors at newlines.  */
 726   re_comp_buf.newline_anchor = 1;
 727
 728   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 729
 730   if (!ret)
 731     return NULL;
 732
 733   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 734   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 735 }
 736
 737 #ifdef _LIBC
 738 libc_freeres_fn (free_mem)
 739 {
 740   __regfree (&re_comp_buf);
 741 }
 742 #endif
 743
 744 #endif /* _REGEX_RE_COMP */
 745 \f
 746 /* Internal entry point.
 747    Compile the regular expression PATTERN, whose length is LENGTH.
 748    SYNTAX indicate regular expression's syntax.  */
 749
 750 static reg_errcode_t
 751 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 752                      reg_syntax_t syntax)
 753 {
 754   reg_errcode_t err = REG_NOERROR;
 755   re_dfa_t *dfa;
 756   re_string_t regexp;
 757
 758   /* Initialize the pattern buffer.  */
 759   preg->fastmap_accurate = 0;
 760   preg->syntax = syntax;
 761   preg->not_bol = preg->not_eol = 0;
 762   preg->used = 0;
 763   preg->re_nsub = 0;
 764   preg->can_be_null = 0;
 765   preg->regs_allocated = REGS_UNALLOCATED;
 766
 767   /* Initialize the dfa.  */
 768   dfa = (re_dfa_t *) preg->buffer;
 769   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 770     {
 771       /* If zero allocated, but buffer is non-null, try to realloc
 772          enough space.  This loses if buffer's address is bogus, but
 773          that is the user's responsibility.  If ->buffer is NULL this
 774          is a simple allocation.  */
 775       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 776       if (dfa == NULL)
 777         return REG_ESPACE;
 778       preg->allocated = sizeof (re_dfa_t);
 779       preg->buffer = (unsigned char *) dfa;
 780     }
 781   preg->used = sizeof (re_dfa_t);
 782
 783   err = init_dfa (dfa, length);
 784   if (BE (err != REG_NOERROR, 0))
 785     {
 786       free_dfa_content (dfa);
 787       preg->buffer = NULL;
 788       preg->allocated = 0;
 789       return err;
 790     }
 791 #ifdef DEBUG
 792   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 793   dfa->re_str = re_malloc (char, length + 1);
 794   strncpy (dfa->re_str, pattern, length + 1);
 795 #endif
 796
 797   __libc_lock_init (dfa->lock);
 798
 799   err = re_string_construct (&regexp, pattern, length, preg->translate,
 800                              (syntax & RE_ICASE) != 0, dfa);
 801   if (BE (err != REG_NOERROR, 0))
 802     {
 803     re_compile_internal_free_return:
 804       free_workarea_compile (preg);
 805       re_string_destruct (&regexp);
 806       free_dfa_content (dfa);
 807       preg->buffer = NULL;
 808       preg->allocated = 0;
 809       return err;
 810     }
 811
 812   /* Parse the regular expression, and build a structure tree.  */
 813   preg->re_nsub = 0;
 814   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 815   if (BE (dfa->str_tree == NULL, 0))
 816     goto re_compile_internal_free_return;
 817
 818   /* Analyze the tree and create the nfa.  */
 819   err = analyze (preg);
 820   if (BE (err != REG_NOERROR, 0))
 821     goto re_compile_internal_free_return;
 822
 823 #ifdef RE_ENABLE_I18N
 824   /* If possible, do searching in single byte encoding to speed things up.  */
 825   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 826     optimize_utf8 (dfa);
 827 #endif
 828
 829   /* Then create the initial state of the dfa.  */
 830   err = create_initial_state (dfa);
 831
 832   /* Release work areas.  */
 833   free_workarea_compile (preg);
 834   re_string_destruct (&regexp);
 835
 836   if (BE (err != REG_NOERROR, 0))
 837     {
 838       free_dfa_content (dfa);
 839       preg->buffer = NULL;
 840       preg->allocated = 0;
 841     }
 842
 843   return err;
 844 }
 845
 846 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 847    as the initial length of some arrays.  */
 848
 849 static reg_errcode_t
 850 init_dfa (re_dfa_t *dfa, size_t pat_len)
 851 {
 852   __re_size_t table_size;
 853 #ifndef _LIBC
 854   char *codeset_name;
 855 #endif
 856 #ifdef RE_ENABLE_I18N
 857   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 858 #else
 859   size_t max_i18n_object_size = 0;
 860 #endif
 861   size_t max_object_size =
 862     MAX (sizeof (struct re_state_table_entry),
 863          MAX (sizeof (re_token_t),
 864               MAX (sizeof (re_node_set),
 865                    MAX (sizeof (regmatch_t),
 866                         max_i18n_object_size))));
 867
 868   memset (dfa, '\0', sizeof (re_dfa_t));
 869
 870   /* Force allocation of str_tree_storage the first time.  */
 871   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 872
 873   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 874      calculation below, and for similar doubling calculations
 875      elsewhere.  And it's <= rather than <, because some of the
 876      doubling calculations add 1 afterwards.  */
 877   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 878     return REG_ESPACE;
 879
 880   dfa->nodes_alloc = pat_len + 1;
 881   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 882
 883   /*  table_size = 2 ^ ceil(log pat_len) */
 884   for (table_size = 1; ; table_size <<= 1)
 885     if (table_size > pat_len)
 886       break;
 887
 888   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 889   dfa->state_hash_mask = table_size - 1;
 890
 891   dfa->mb_cur_max = MB_CUR_MAX;
 892 #ifdef _LIBC
 893   if (dfa->mb_cur_max == 6
 894       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 895     dfa->is_utf8 = 1;
 896   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 897                        != 0);
 898 #else
 899   codeset_name = nl_langinfo (CODESET);
 900   if (strcasecmp (codeset_name, "UTF-8") == 0
 901       || strcasecmp (codeset_name, "UTF8") == 0)
 902     dfa->is_utf8 = 1;
 903
 904   /* We check exhaustively in the loop below if this charset is a
 905      superset of ASCII.  */
 906   dfa->map_notascii = 0;
 907 #endif
 908
 909 #ifdef RE_ENABLE_I18N
 910   if (dfa->mb_cur_max > 1)
 911     {
 912       if (dfa->is_utf8)
 913         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 914       else
 915         {
 916           int i, j, ch;
 917
 918           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 919           if (BE (dfa->sb_char == NULL, 0))
 920             return REG_ESPACE;
 921
 922           /* Set the bits corresponding to single byte chars.  */
 923           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 924             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 925               {
 926                 wint_t wch = __btowc (ch);
 927                 if (wch != WEOF)
 928                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 929 # ifndef _LIBC
 930                 if (isascii (ch) && wch != ch)
 931                   dfa->map_notascii = 1;
 932 # endif
 933               }
 934         }
 935     }
 936 #endif
 937
 938   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 939     return REG_ESPACE;
 940   return REG_NOERROR;
 941 }
 942
 943 /* Initialize WORD_CHAR table, which indicate which character is
 944    "word".  In this case "word" means that it is the word construction
 945    character used by some operators like "\<", "\>", etc.  */
 946
 947 static void
 948 internal_function
 949 init_word_char (re_dfa_t *dfa)
 950 {
 951   int i, j, ch;
 952   dfa->word_ops_used = 1;
 953   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 954     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 955       if (isalnum (ch) || ch == '_')
 956         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 957 }
 958
 959 /* Free the work area which are only used while compiling.  */
 960
 961 static void
 962 free_workarea_compile (regex_t *preg)
 963 {
 964   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 965   bin_tree_storage_t *storage, *next;
 966   for (storage = dfa->str_tree_storage; storage; storage = next)
 967     {
 968       next = storage->next;
 969       re_free (storage);
 970     }
 971   dfa->str_tree_storage = NULL;
 972   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 973   dfa->str_tree = NULL;
 974   re_free (dfa->org_indices);
 975   dfa->org_indices = NULL;
 976 }
 977
 978 /* Create initial states for all contexts.  */
 979
 980 static reg_errcode_t
 981 create_initial_state (re_dfa_t *dfa)
 982 {
 983   Idx first, i;
 984   reg_errcode_t err;
 985   re_node_set init_nodes;
 986
 987   /* Initial states have the epsilon closure of the node which is
 988      the first node of the regular expression.  */
 989   first = dfa->str_tree->first->node_idx;
 990   dfa->init_node = first;
 991   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 992   if (BE (err != REG_NOERROR, 0))
 993     return err;
 994
 995   /* The back-references which are in initial states can epsilon transit,
 996      since in this case all of the subexpressions can be null.
 997      Then we add epsilon closures of the nodes which are the next nodes of
 998      the back-references.  */
 999   if (dfa->nbackref > 0)
1000     for (i = 0; i < init_nodes.nelem; ++i)
1001       {
1002         Idx node_idx = init_nodes.elems[i];
1003         re_token_type_t type = dfa->nodes[node_idx].type;
1004
1005         Idx clexp_idx;
1006         if (type != OP_BACK_REF)
1007           continue;
1008         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1009           {
1010             re_token_t *clexp_node;
1011             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1012             if (clexp_node->type == OP_CLOSE_SUBEXP
1013                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1014               break;
1015           }
1016         if (clexp_idx == init_nodes.nelem)
1017           continue;
1018
1019         if (type == OP_BACK_REF)
1020           {
1021             Idx dest_idx = dfa->edests[node_idx].elems[0];
1022             if (!re_node_set_contains (&init_nodes, dest_idx))
1023               {
1024                 reg_errcode_t merge_err
1025                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1026                 if (merge_err != REG_NOERROR)
1027                   return merge_err;
1028                 i = 0;
1029               }
1030           }
1031       }
1032
1033   /* It must be the first time to invoke acquire_state.  */
1034   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1035   /* We don't check ERR here, since the initial state must not be NULL.  */
1036   if (BE (dfa->init_state == NULL, 0))
1037     return err;
1038   if (dfa->init_state->has_constraint)
1039     {
1040       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1041                                                        CONTEXT_WORD);
1042       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1043                                                      CONTEXT_NEWLINE);
1044       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1045                                                          &init_nodes,
1046                                                          CONTEXT_NEWLINE
1047                                                          | CONTEXT_BEGBUF);
1048       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1049               || dfa->init_state_begbuf == NULL, 0))
1050         return err;
1051     }
1052   else
1053     dfa->init_state_word = dfa->init_state_nl
1054       = dfa->init_state_begbuf = dfa->init_state;
1055
1056   re_node_set_free (&init_nodes);
1057   return REG_NOERROR;
1058 }
1059 \f
1060 #ifdef RE_ENABLE_I18N
1061 /* If it is possible to do searching in single byte encoding instead of UTF-8
1062    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1063    DFA nodes where needed.  */
1064
1065 static void
1066 optimize_utf8 (re_dfa_t *dfa)
1067 {
1068   Idx node;
1069   int i;
1070   bool mb_chars = false;
1071   bool has_period = false;
1072
1073   for (node = 0; node < dfa->nodes_len; ++node)
1074     switch (dfa->nodes[node].type)
1075       {
1076       case CHARACTER:
1077         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1078           mb_chars = true;
1079         break;
1080       case ANCHOR:
1081         switch (dfa->nodes[node].opr.ctx_type)
1082           {
1083           case LINE_FIRST:
1084           case LINE_LAST:
1085           case BUF_FIRST:
1086           case BUF_LAST:
1087             break;
1088           default:
1089             /* Word anchors etc. cannot be handled.  It's okay to test
1090                opr.ctx_type since constraints (for all DFA nodes) are
1091                created by ORing one or more opr.ctx_type values.  */
1092             return;
1093           }
1094         break;
1095       case OP_PERIOD:
1096         has_period = true;
1097         break;
1098       case OP_BACK_REF:
1099       case OP_ALT:
1100       case END_OF_RE:
1101       case OP_DUP_ASTERISK:
1102       case OP_OPEN_SUBEXP:
1103       case OP_CLOSE_SUBEXP:
1104         break;
1105       case COMPLEX_BRACKET:
1106         return;
1107       case SIMPLE_BRACKET:
1108         /* Just double check.  */
1109         {
1110           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1111                         ? 0
1112                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1113           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1114             {
1115               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1116                 return;
1117               rshift = 0;
1118             }
1119         }
1120         break;
1121       default:
1122         abort ();
1123       }
1124
1125   if (mb_chars || has_period)
1126     for (node = 0; node < dfa->nodes_len; ++node)
1127       {
1128         if (dfa->nodes[node].type == CHARACTER
1129             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1130           dfa->nodes[node].mb_partial = 0;
1131         else if (dfa->nodes[node].type == OP_PERIOD)
1132           dfa->nodes[node].type = OP_UTF8_PERIOD;
1133       }
1134
1135   /* The search can be in single byte locale.  */
1136   dfa->mb_cur_max = 1;
1137   dfa->is_utf8 = 0;
1138   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1139 }
1140 #endif
1141 \f
1142 /* Analyze the structure tree, and calculate "first", "next", "edest",
1143    "eclosure", and "inveclosure".  */
1144
1145 static reg_errcode_t
1146 analyze (regex_t *preg)
1147 {
1148   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1149   reg_errcode_t ret;
1150
1151   /* Allocate arrays.  */
1152   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1153   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1154   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1155   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1156   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1157           || dfa->eclosures == NULL, 0))
1158     return REG_ESPACE;
1159
1160   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1161   if (dfa->subexp_map != NULL)
1162     {
1163       Idx i;
1164       for (i = 0; i < preg->re_nsub; i++)
1165         dfa->subexp_map[i] = i;
1166       preorder (dfa->str_tree, optimize_subexps, dfa);
1167       for (i = 0; i < preg->re_nsub; i++)
1168         if (dfa->subexp_map[i] != i)
1169           break;
1170       if (i == preg->re_nsub)
1171         {
1172           free (dfa->subexp_map);
1173           dfa->subexp_map = NULL;
1174         }
1175     }
1176
1177   ret = postorder (dfa->str_tree, lower_subexps, preg);
1178   if (BE (ret != REG_NOERROR, 0))
1179     return ret;
1180   ret = postorder (dfa->str_tree, calc_first, dfa);
1181   if (BE (ret != REG_NOERROR, 0))
1182     return ret;
1183   preorder (dfa->str_tree, calc_next, dfa);
1184   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1185   if (BE (ret != REG_NOERROR, 0))
1186     return ret;
1187   ret = calc_eclosure (dfa);
1188   if (BE (ret != REG_NOERROR, 0))
1189     return ret;
1190
1191   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1192      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1193   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1194       || dfa->nbackref)
1195     {
1196       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1197       if (BE (dfa->inveclosures == NULL, 0))
1198         return REG_ESPACE;
1199       ret = calc_inveclosure (dfa);
1200     }
1201
1202   return ret;
1203 }
1204
1205 /* Our parse trees are very unbalanced, so we cannot use a stack to
1206    implement parse tree visits.  Instead, we use parent pointers and
1207    some hairy code in these two functions.  */
1208 static reg_errcode_t
1209 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1210            void *extra)
1211 {
1212   bin_tree_t *node, *prev;
1213
1214   for (node = root; ; )
1215     {
1216       /* Descend down the tree, preferably to the left (or to the right
1217          if that's the only child).  */
1218       while (node->left || node->right)
1219         if (node->left)
1220           node = node->left;
1221         else
1222           node = node->right;
1223
1224       do
1225         {
1226           reg_errcode_t err = fn (extra, node);
1227           if (BE (err != REG_NOERROR, 0))
1228             return err;
1229           if (node->parent == NULL)
1230             return REG_NOERROR;
1231           prev = node;
1232           node = node->parent;
1233         }
1234       /* Go up while we have a node that is reached from the right.  */
1235       while (node->right == prev || node->right == NULL);
1236       node = node->right;
1237     }
1238 }
1239
1240 static reg_errcode_t
1241 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1242           void *extra)
1243 {
1244   bin_tree_t *node;
1245
1246   for (node = root; ; )
1247     {
1248       reg_errcode_t err = fn (extra, node);
1249       if (BE (err != REG_NOERROR, 0))
1250         return err;
1251
1252       /* Go to the left node, or up and to the right.  */
1253       if (node->left)
1254         node = node->left;
1255       else
1256         {
1257           bin_tree_t *prev = NULL;
1258           while (node->right == prev || node->right == NULL)
1259             {
1260               prev = node;
1261               node = node->parent;
1262               if (!node)
1263                 return REG_NOERROR;
1264             }
1265           node = node->right;
1266         }
1267     }
1268 }
1269
1270 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1272    backreferences as well.  Requires a preorder visit.  */
1273 static reg_errcode_t
1274 optimize_subexps (void *extra, bin_tree_t *node)
1275 {
1276   re_dfa_t *dfa = (re_dfa_t *) extra;
1277
1278   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1279     {
1280       int idx = node->token.opr.idx;
1281       node->token.opr.idx = dfa->subexp_map[idx];
1282       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1283     }
1284
1285   else if (node->token.type == SUBEXP
1286            && node->left && node->left->token.type == SUBEXP)
1287     {
1288       Idx other_idx = node->left->token.opr.idx;
1289
1290       node->left = node->left->left;
1291       if (node->left)
1292         node->left->parent = node;
1293
1294       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295       if (other_idx < BITSET_WORD_BITS)
1296         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1297     }
1298
1299   return REG_NOERROR;
1300 }
1301
1302 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1304 static reg_errcode_t
1305 lower_subexps (void *extra, bin_tree_t *node)
1306 {
1307   regex_t *preg = (regex_t *) extra;
1308   reg_errcode_t err = REG_NOERROR;
1309
1310   if (node->left && node->left->token.type == SUBEXP)
1311     {
1312       node->left = lower_subexp (&err, preg, node->left);
1313       if (node->left)
1314         node->left->parent = node;
1315     }
1316   if (node->right && node->right->token.type == SUBEXP)
1317     {
1318       node->right = lower_subexp (&err, preg, node->right);
1319       if (node->right)
1320         node->right->parent = node;
1321     }
1322
1323   return err;
1324 }
1325
1326 static bin_tree_t *
1327 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1328 {
1329   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1330   bin_tree_t *body = node->left;
1331   bin_tree_t *op, *cls, *tree1, *tree;
1332
1333   if (preg->no_sub
1334       /* We do not optimize empty subexpressions, because otherwise we may
1335          have bad CONCAT nodes with NULL children.  This is obviously not
1336          very common, so we do not lose much.  An example that triggers
1337          this case is the sed "script" /\(\)/x.  */
1338       && node->left != NULL
1339       && (node->token.opr.idx >= BITSET_WORD_BITS
1340           || !(dfa->used_bkref_map
1341                & ((bitset_word_t) 1 << node->token.opr.idx))))
1342     return node->left;
1343
1344   /* Convert the SUBEXP node to the concatenation of an
1345      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1346   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349   tree = create_tree (dfa, op, tree1, CONCAT);
1350   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1351     {
1352       *err = REG_ESPACE;
1353       return NULL;
1354     }
1355
1356   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1358   return tree;
1359 }
1360
1361 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362    nodes.  Requires a postorder visit.  */
1363 static reg_errcode_t
1364 calc_first (void *extra, bin_tree_t *node)
1365 {
1366   re_dfa_t *dfa = (re_dfa_t *) extra;
1367   if (node->token.type == CONCAT)
1368     {
1369       node->first = node->left->first;
1370       node->node_idx = node->left->node_idx;
1371     }
1372   else
1373     {
1374       node->first = node;
1375       node->node_idx = re_dfa_add_node (dfa, node->token);
1376       if (BE (node->node_idx == REG_MISSING, 0))
1377         return REG_ESPACE;
1378       if (node->token.type == ANCHOR)
1379         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1380     }
1381   return REG_NOERROR;
1382 }
1383
1384 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1385 static reg_errcode_t
1386 calc_next (void *extra, bin_tree_t *node)
1387 {
1388   switch (node->token.type)
1389     {
1390     case OP_DUP_ASTERISK:
1391       node->left->next = node;
1392       break;
1393     case CONCAT:
1394       node->left->next = node->right->first;
1395       node->right->next = node->next;
1396       break;
1397     default:
1398       if (node->left)
1399         node->left->next = node->next;
1400       if (node->right)
1401         node->right->next = node->next;
1402       break;
1403     }
1404   return REG_NOERROR;
1405 }
1406
1407 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1408 static reg_errcode_t
1409 link_nfa_nodes (void *extra, bin_tree_t *node)
1410 {
1411   re_dfa_t *dfa = (re_dfa_t *) extra;
1412   Idx idx = node->node_idx;
1413   reg_errcode_t err = REG_NOERROR;
1414
1415   switch (node->token.type)
1416     {
1417     case CONCAT:
1418       break;
1419
1420     case END_OF_RE:
1421       assert (node->next == NULL);
1422       break;
1423
1424     case OP_DUP_ASTERISK:
1425     case OP_ALT:
1426       {
1427         Idx left, right;
1428         dfa->has_plural_match = 1;
1429         if (node->left != NULL)
1430           left = node->left->first->node_idx;
1431         else
1432           left = node->next->node_idx;
1433         if (node->right != NULL)
1434           right = node->right->first->node_idx;
1435         else
1436           right = node->next->node_idx;
1437         assert (REG_VALID_INDEX (left));
1438         assert (REG_VALID_INDEX (right));
1439         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1440       }
1441       break;
1442
1443     case ANCHOR:
1444     case OP_OPEN_SUBEXP:
1445     case OP_CLOSE_SUBEXP:
1446       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1447       break;
1448
1449     case OP_BACK_REF:
1450       dfa->nexts[idx] = node->next->node_idx;
1451       if (node->token.type == OP_BACK_REF)
1452         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1453       break;
1454
1455     default:
1456       assert (!IS_EPSILON_NODE (node->token.type));
1457       dfa->nexts[idx] = node->next->node_idx;
1458       break;
1459     }
1460
1461   return err;
1462 }
1463
1464 /* Duplicate the epsilon closure of the node ROOT_NODE.
1465    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466    to their own constraint.  */
1467
1468 static reg_errcode_t
1469 internal_function
1470 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1471                         Idx root_node, unsigned int init_constraint)
1472 {
1473   Idx org_node, clone_node;
1474   bool ok;
1475   unsigned int constraint = init_constraint;
1476   for (org_node = top_org_node, clone_node = top_clone_node;;)
1477     {
1478       Idx org_dest, clone_dest;
1479       if (dfa->nodes[org_node].type == OP_BACK_REF)
1480         {
1481           /* If the back reference epsilon-transit, its destination must
1482              also have the constraint.  Then duplicate the epsilon closure
1483              of the destination of the back reference, and store it in
1484              edests of the back reference.  */
1485           org_dest = dfa->nexts[org_node];
1486           re_node_set_empty (dfa->edests + clone_node);
1487           clone_dest = duplicate_node (dfa, org_dest, constraint);
1488           if (BE (clone_dest == REG_MISSING, 0))
1489             return REG_ESPACE;
1490           dfa->nexts[clone_node] = dfa->nexts[org_node];
1491           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1492           if (BE (! ok, 0))
1493             return REG_ESPACE;
1494         }
1495       else if (dfa->edests[org_node].nelem == 0)
1496         {
1497           /* In case of the node can't epsilon-transit, don't duplicate the
1498              destination and store the original destination as the
1499              destination of the node.  */
1500           dfa->nexts[clone_node] = dfa->nexts[org_node];
1501           break;
1502         }
1503       else if (dfa->edests[org_node].nelem == 1)
1504         {
1505           /* In case of the node can epsilon-transit, and it has only one
1506              destination.  */
1507           org_dest = dfa->edests[org_node].elems[0];
1508           re_node_set_empty (dfa->edests + clone_node);
1509           /* If the node is root_node itself, it means the epsilon closure
1510              has a loop.  Then tie it to the destination of the root_node.  */
1511           if (org_node == root_node && clone_node != org_node)
1512             {
1513               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1514               if (BE (! ok, 0))
1515                 return REG_ESPACE;
1516               break;
1517             }
1518           /* In case the node has another constraint, append it.  */
1519           constraint |= dfa->nodes[org_node].constraint;
1520           clone_dest = duplicate_node (dfa, org_dest, constraint);
1521           if (BE (clone_dest == REG_MISSING, 0))
1522             return REG_ESPACE;
1523           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1524           if (BE (! ok, 0))
1525             return REG_ESPACE;
1526         }
1527       else /* dfa->edests[org_node].nelem == 2 */
1528         {
1529           /* In case of the node can epsilon-transit, and it has two
1530              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1531           org_dest = dfa->edests[org_node].elems[0];
1532           re_node_set_empty (dfa->edests + clone_node);
1533           /* Search for a duplicated node which satisfies the constraint.  */
1534           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1535           if (clone_dest == REG_MISSING)
1536             {
1537               /* There is no such duplicated node, create a new one.  */
1538               reg_errcode_t err;
1539               clone_dest = duplicate_node (dfa, org_dest, constraint);
1540               if (BE (clone_dest == REG_MISSING, 0))
1541                 return REG_ESPACE;
1542               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1543               if (BE (! ok, 0))
1544                 return REG_ESPACE;
1545               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1546                                             root_node, constraint);
1547               if (BE (err != REG_NOERROR, 0))
1548                 return err;
1549             }
1550           else
1551             {
1552               /* There is a duplicated node which satisfies the constraint,
1553                  use it to avoid infinite loop.  */
1554               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555               if (BE (! ok, 0))
1556                 return REG_ESPACE;
1557             }
1558
1559           org_dest = dfa->edests[org_node].elems[1];
1560           clone_dest = duplicate_node (dfa, org_dest, constraint);
1561           if (BE (clone_dest == REG_MISSING, 0))
1562             return REG_ESPACE;
1563           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564           if (BE (! ok, 0))
1565             return REG_ESPACE;
1566         }
1567       org_node = org_dest;
1568       clone_node = clone_dest;
1569     }
1570   return REG_NOERROR;
1571 }
1572
1573 /* Search for a node which is duplicated from the node ORG_NODE, and
1574    satisfies the constraint CONSTRAINT.  */
1575
1576 static Idx
1577 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1578                         unsigned int constraint)
1579 {
1580   Idx idx;
1581   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1582     {
1583       if (org_node == dfa->org_indices[idx]
1584           && constraint == dfa->nodes[idx].constraint)
1585         return idx; /* Found.  */
1586     }
1587   return REG_MISSING; /* Not found.  */
1588 }
1589
1590 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1591    Return the index of the new node, or REG_MISSING if insufficient storage is
1592    available.  */
1593
1594 static Idx
1595 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1596 {
1597   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1598   if (BE (dup_idx != REG_MISSING, 1))
1599     {
1600       dfa->nodes[dup_idx].constraint = constraint;
1601       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1602       dfa->nodes[dup_idx].duplicated = 1;
1603
1604       /* Store the index of the original node.  */
1605       dfa->org_indices[dup_idx] = org_idx;
1606     }
1607   return dup_idx;
1608 }
1609
1610 static reg_errcode_t
1611 calc_inveclosure (re_dfa_t *dfa)
1612 {
1613   Idx src, idx;
1614   bool ok;
1615   for (idx = 0; idx < dfa->nodes_len; ++idx)
1616     re_node_set_init_empty (dfa->inveclosures + idx);
1617
1618   for (src = 0; src < dfa->nodes_len; ++src)
1619     {
1620       Idx *elems = dfa->eclosures[src].elems;
1621       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1622         {
1623           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1624           if (BE (! ok, 0))
1625             return REG_ESPACE;
1626         }
1627     }
1628
1629   return REG_NOERROR;
1630 }
1631
1632 /* Calculate "eclosure" for all the node in DFA.  */
1633
1634 static reg_errcode_t
1635 calc_eclosure (re_dfa_t *dfa)
1636 {
1637   Idx node_idx;
1638   bool incomplete;
1639 #ifdef DEBUG
1640   assert (dfa->nodes_len > 0);
1641 #endif
1642   incomplete = false;
1643   /* For each nodes, calculate epsilon closure.  */
1644   for (node_idx = 0; ; ++node_idx)
1645     {
1646       reg_errcode_t err;
1647       re_node_set eclosure_elem;
1648       if (node_idx == dfa->nodes_len)
1649         {
1650           if (!incomplete)
1651             break;
1652           incomplete = false;
1653           node_idx = 0;
1654         }
1655
1656 #ifdef DEBUG
1657       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1658 #endif
1659
1660       /* If we have already calculated, skip it.  */
1661       if (dfa->eclosures[node_idx].nelem != 0)
1662         continue;
1663       /* Calculate epsilon closure of `node_idx'.  */
1664       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1665       if (BE (err != REG_NOERROR, 0))
1666         return err;
1667
1668       if (dfa->eclosures[node_idx].nelem == 0)
1669         {
1670           incomplete = true;
1671           re_node_set_free (&eclosure_elem);
1672         }
1673     }
1674   return REG_NOERROR;
1675 }
1676
1677 /* Calculate epsilon closure of NODE.  */
1678
1679 static reg_errcode_t
1680 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1681 {
1682   reg_errcode_t err;
1683   Idx i;
1684   re_node_set eclosure;
1685   bool ok;
1686   bool incomplete = false;
1687   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1688   if (BE (err != REG_NOERROR, 0))
1689     return err;
1690
1691   /* This indicates that we are calculating this node now.
1692      We reference this value to avoid infinite loop.  */
1693   dfa->eclosures[node].nelem = REG_MISSING;
1694
1695   /* If the current node has constraints, duplicate all nodes
1696      since they must inherit the constraints.  */
1697   if (dfa->nodes[node].constraint
1698       && dfa->edests[node].nelem
1699       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1700     {
1701       err = duplicate_node_closure (dfa, node, node, node,
1702                                     dfa->nodes[node].constraint);
1703       if (BE (err != REG_NOERROR, 0))
1704         return err;
1705     }
1706
1707   /* Expand each epsilon destination nodes.  */
1708   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1709     for (i = 0; i < dfa->edests[node].nelem; ++i)
1710       {
1711         re_node_set eclosure_elem;
1712         Idx edest = dfa->edests[node].elems[i];
1713         /* If calculating the epsilon closure of `edest' is in progress,
1714            return intermediate result.  */
1715         if (dfa->eclosures[edest].nelem == REG_MISSING)
1716           {
1717             incomplete = true;
1718             continue;
1719           }
1720         /* If we haven't calculated the epsilon closure of `edest' yet,
1721            calculate now. Otherwise use calculated epsilon closure.  */
1722         if (dfa->eclosures[edest].nelem == 0)
1723           {
1724             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1725             if (BE (err != REG_NOERROR, 0))
1726               return err;
1727           }
1728         else
1729           eclosure_elem = dfa->eclosures[edest];
1730         /* Merge the epsilon closure of `edest'.  */
1731         err = re_node_set_merge (&eclosure, &eclosure_elem);
1732         if (BE (err != REG_NOERROR, 0))
1733           return err;
1734         /* If the epsilon closure of `edest' is incomplete,
1735            the epsilon closure of this node is also incomplete.  */
1736         if (dfa->eclosures[edest].nelem == 0)
1737           {
1738             incomplete = true;
1739             re_node_set_free (&eclosure_elem);
1740           }
1741       }
1742
1743   /* An epsilon closure includes itself.  */
1744   ok = re_node_set_insert (&eclosure, node);
1745   if (BE (! ok, 0))
1746     return REG_ESPACE;
1747   if (incomplete && !root)
1748     dfa->eclosures[node].nelem = 0;
1749   else
1750     dfa->eclosures[node] = eclosure;
1751   *new_set = eclosure;
1752   return REG_NOERROR;
1753 }
1754 \f
1755 /* Functions for token which are used in the parser.  */
1756
1757 /* Fetch a token from INPUT.
1758    We must not use this function inside bracket expressions.  */
1759
1760 static void
1761 internal_function
1762 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1763 {
1764   re_string_skip_bytes (input, peek_token (result, input, syntax));
1765 }
1766
1767 /* Peek a token from INPUT, and return the length of the token.
1768    We must not use this function inside bracket expressions.  */
1769
1770 static int
1771 internal_function
1772 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1773 {
1774   unsigned char c;
1775
1776   if (re_string_eoi (input))
1777     {
1778       token->type = END_OF_RE;
1779       return 0;
1780     }
1781
1782   c = re_string_peek_byte (input, 0);
1783   token->opr.c = c;
1784
1785   token->word_char = 0;
1786 #ifdef RE_ENABLE_I18N
1787   token->mb_partial = 0;
1788   if (input->mb_cur_max > 1 &&
1789       !re_string_first_byte (input, re_string_cur_idx (input)))
1790     {
1791       token->type = CHARACTER;
1792       token->mb_partial = 1;
1793       return 1;
1794     }
1795 #endif
1796   if (c == '\\')
1797     {
1798       unsigned char c2;
1799       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1800         {
1801           token->type = BACK_SLASH;
1802           return 1;
1803         }
1804
1805       c2 = re_string_peek_byte_case (input, 1);
1806       token->opr.c = c2;
1807       token->type = CHARACTER;
1808 #ifdef RE_ENABLE_I18N
1809       if (input->mb_cur_max > 1)
1810         {
1811           wint_t wc = re_string_wchar_at (input,
1812                                           re_string_cur_idx (input) + 1);
1813           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1814         }
1815       else
1816 #endif
1817         token->word_char = IS_WORD_CHAR (c2) != 0;
1818
1819       switch (c2)
1820         {
1821         case '|':
1822           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1823             token->type = OP_ALT;
1824           break;
1825         case '1': case '2': case '3': case '4': case '5':
1826         case '6': case '7': case '8': case '9':
1827           if (!(syntax & RE_NO_BK_REFS))
1828             {
1829               token->type = OP_BACK_REF;
1830               token->opr.idx = c2 - '1';
1831             }
1832           break;
1833         case '<':
1834           if (!(syntax & RE_NO_GNU_OPS))
1835             {
1836               token->type = ANCHOR;
1837               token->opr.ctx_type = WORD_FIRST;
1838             }
1839           break;
1840         case '>':
1841           if (!(syntax & RE_NO_GNU_OPS))
1842             {
1843               token->type = ANCHOR;
1844               token->opr.ctx_type = WORD_LAST;
1845             }
1846           break;
1847         case 'b':
1848           if (!(syntax & RE_NO_GNU_OPS))
1849             {
1850               token->type = ANCHOR;
1851               token->opr.ctx_type = WORD_DELIM;
1852             }
1853           break;
1854         case 'B':
1855           if (!(syntax & RE_NO_GNU_OPS))
1856             {
1857               token->type = ANCHOR;
1858               token->opr.ctx_type = NOT_WORD_DELIM;
1859             }
1860           break;
1861         case 'w':
1862           if (!(syntax & RE_NO_GNU_OPS))
1863             token->type = OP_WORD;
1864           break;
1865         case 'W':
1866           if (!(syntax & RE_NO_GNU_OPS))
1867             token->type = OP_NOTWORD;
1868           break;
1869         case 's':
1870           if (!(syntax & RE_NO_GNU_OPS))
1871             token->type = OP_SPACE;
1872           break;
1873         case 'S':
1874           if (!(syntax & RE_NO_GNU_OPS))
1875             token->type = OP_NOTSPACE;
1876           break;
1877         case '`':
1878           if (!(syntax & RE_NO_GNU_OPS))
1879             {
1880               token->type = ANCHOR;
1881               token->opr.ctx_type = BUF_FIRST;
1882             }
1883           break;
1884         case '\'':
1885           if (!(syntax & RE_NO_GNU_OPS))
1886             {
1887               token->type = ANCHOR;
1888               token->opr.ctx_type = BUF_LAST;
1889             }
1890           break;
1891         case '(':
1892           if (!(syntax & RE_NO_BK_PARENS))
1893             token->type = OP_OPEN_SUBEXP;
1894           break;
1895         case ')':
1896           if (!(syntax & RE_NO_BK_PARENS))
1897             token->type = OP_CLOSE_SUBEXP;
1898           break;
1899         case '+':
1900           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1901             token->type = OP_DUP_PLUS;
1902           break;
1903         case '?':
1904           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1905             token->type = OP_DUP_QUESTION;
1906           break;
1907         case '{':
1908           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1909             token->type = OP_OPEN_DUP_NUM;
1910           break;
1911         case '}':
1912           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1913             token->type = OP_CLOSE_DUP_NUM;
1914           break;
1915         default:
1916           break;
1917         }
1918       return 2;
1919     }
1920
1921   token->type = CHARACTER;
1922 #ifdef RE_ENABLE_I18N
1923   if (input->mb_cur_max > 1)
1924     {
1925       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1926       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1927     }
1928   else
1929 #endif
1930     token->word_char = IS_WORD_CHAR (token->opr.c);
1931
1932   switch (c)
1933     {
1934     case '\n':
1935       if (syntax & RE_NEWLINE_ALT)
1936         token->type = OP_ALT;
1937       break;
1938     case '|':
1939       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1940         token->type = OP_ALT;
1941       break;
1942     case '*':
1943       token->type = OP_DUP_ASTERISK;
1944       break;
1945     case '+':
1946       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1947         token->type = OP_DUP_PLUS;
1948       break;
1949     case '?':
1950       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1951         token->type = OP_DUP_QUESTION;
1952       break;
1953     case '{':
1954       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1955         token->type = OP_OPEN_DUP_NUM;
1956       break;
1957     case '}':
1958       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1959         token->type = OP_CLOSE_DUP_NUM;
1960       break;
1961     case '(':
1962       if (syntax & RE_NO_BK_PARENS)
1963         token->type = OP_OPEN_SUBEXP;
1964       break;
1965     case ')':
1966       if (syntax & RE_NO_BK_PARENS)
1967         token->type = OP_CLOSE_SUBEXP;
1968       break;
1969     case '[':
1970       token->type = OP_OPEN_BRACKET;
1971       break;
1972     case '.':
1973       token->type = OP_PERIOD;
1974       break;
1975     case '^':
1976       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1977           re_string_cur_idx (input) != 0)
1978         {
1979           char prev = re_string_peek_byte (input, -1);
1980           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1981             break;
1982         }
1983       token->type = ANCHOR;
1984       token->opr.ctx_type = LINE_FIRST;
1985       break;
1986     case '$':
1987       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1988           re_string_cur_idx (input) + 1 != re_string_length (input))
1989         {
1990           re_token_t next;
1991           re_string_skip_bytes (input, 1);
1992           peek_token (&next, input, syntax);
1993           re_string_skip_bytes (input, -1);
1994           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1995             break;
1996         }
1997       token->type = ANCHOR;
1998       token->opr.ctx_type = LINE_LAST;
1999       break;
2000     default:
2001       break;
2002     }
2003   return 1;
2004 }
2005
2006 /* Peek a token from INPUT, and return the length of the token.
2007    We must not use this function out of bracket expressions.  */
2008
2009 static int
2010 internal_function
2011 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2012 {
2013   unsigned char c;
2014   if (re_string_eoi (input))
2015     {
2016       token->type = END_OF_RE;
2017       return 0;
2018     }
2019   c = re_string_peek_byte (input, 0);
2020   token->opr.c = c;
2021
2022 #ifdef RE_ENABLE_I18N
2023   if (input->mb_cur_max > 1 &&
2024       !re_string_first_byte (input, re_string_cur_idx (input)))
2025     {
2026       token->type = CHARACTER;
2027       return 1;
2028     }
2029 #endif /* RE_ENABLE_I18N */
2030
2031   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2032       && re_string_cur_idx (input) + 1 < re_string_length (input))
2033     {
2034       /* In this case, '\' escape a character.  */
2035       unsigned char c2;
2036       re_string_skip_bytes (input, 1);
2037       c2 = re_string_peek_byte (input, 0);
2038       token->opr.c = c2;
2039       token->type = CHARACTER;
2040       return 1;
2041     }
2042   if (c == '[') /* '[' is a special char in a bracket exps.  */
2043     {
2044       unsigned char c2;
2045       int token_len;
2046       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2047         c2 = re_string_peek_byte (input, 1);
2048       else
2049         c2 = 0;
2050       token->opr.c = c2;
2051       token_len = 2;
2052       switch (c2)
2053         {
2054         case '.':
2055           token->type = OP_OPEN_COLL_ELEM;
2056           break;
2057         case '=':
2058           token->type = OP_OPEN_EQUIV_CLASS;
2059           break;
2060         case ':':
2061           if (syntax & RE_CHAR_CLASSES)
2062             {
2063               token->type = OP_OPEN_CHAR_CLASS;
2064               break;
2065             }
2066           /* else fall through.  */
2067         default:
2068           token->type = CHARACTER;
2069           token->opr.c = c;
2070           token_len = 1;
2071           break;
2072         }
2073       return token_len;
2074     }
2075   switch (c)
2076     {
2077     case '-':
2078       token->type = OP_CHARSET_RANGE;
2079       break;
2080     case ']':
2081       token->type = OP_CLOSE_BRACKET;
2082       break;
2083     case '^':
2084       token->type = OP_NON_MATCH_LIST;
2085       break;
2086     default:
2087       token->type = CHARACTER;
2088     }
2089   return 1;
2090 }
2091 \f
2092 /* Functions for parser.  */
2093
2094 /* Entry point of the parser.
2095    Parse the regular expression REGEXP and return the structure tree.
2096    If an error is occured, ERR is set by error code, and return NULL.
2097    This function build the following tree, from regular expression <reg_exp>:
2098            CAT
2099            / \
2100           /   \
2101    <reg_exp>  EOR
2102
2103    CAT means concatenation.
2104    EOR means end of regular expression.  */
2105
2106 static bin_tree_t *
2107 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2108        reg_errcode_t *err)
2109 {
2110   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2111   bin_tree_t *tree, *eor, *root;
2112   re_token_t current_token;
2113   dfa->syntax = syntax;
2114   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2115   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2116   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2117     return NULL;
2118   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2119   if (tree != NULL)
2120     root = create_tree (dfa, tree, eor, CONCAT);
2121   else
2122     root = eor;
2123   if (BE (eor == NULL || root == NULL, 0))
2124     {
2125       *err = REG_ESPACE;
2126       return NULL;
2127     }
2128   return root;
2129 }
2130
2131 /* This function build the following tree, from regular expression
2132    <branch1>|<branch2>:
2133            ALT
2134            / \
2135           /   \
2136    <branch1> <branch2>
2137
2138    ALT means alternative, which represents the operator `|'.  */
2139
2140 static bin_tree_t *
2141 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2142                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2143 {
2144   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2145   bin_tree_t *tree, *branch = NULL;
2146   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2147   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2148     return NULL;
2149
2150   while (token->type == OP_ALT)
2151     {
2152       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2153       if (token->type != OP_ALT && token->type != END_OF_RE
2154           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2155         {
2156           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2157           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2158             return NULL;
2159         }
2160       else
2161         branch = NULL;
2162       tree = create_tree (dfa, tree, branch, OP_ALT);
2163       if (BE (tree == NULL, 0))
2164         {
2165           *err = REG_ESPACE;
2166           return NULL;
2167         }
2168     }
2169   return tree;
2170 }
2171
2172 /* This function build the following tree, from regular expression
2173    <exp1><exp2>:
2174         CAT
2175         / \
2176        /   \
2177    <exp1> <exp2>
2178
2179    CAT means concatenation.  */
2180
2181 static bin_tree_t *
2182 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2183               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2184 {
2185   bin_tree_t *tree, *expr;
2186   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2187   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2188   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189     return NULL;
2190
2191   while (token->type != OP_ALT && token->type != END_OF_RE
2192          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2193     {
2194       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2195       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2196         {
2197           return NULL;
2198         }
2199       if (tree != NULL && expr != NULL)
2200         {
2201           tree = create_tree (dfa, tree, expr, CONCAT);
2202           if (tree == NULL)
2203             {
2204               *err = REG_ESPACE;
2205               return NULL;
2206             }
2207         }
2208       else if (tree == NULL)
2209         tree = expr;
2210       /* Otherwise expr == NULL, we don't need to create new tree.  */
2211     }
2212   return tree;
2213 }
2214
2215 /* This function build the following tree, from regular expression a*:
2216          *
2217          |
2218          a
2219 */
2220
2221 static bin_tree_t *
2222 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2223                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2224 {
2225   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2226   bin_tree_t *tree;
2227   switch (token->type)
2228     {
2229     case CHARACTER:
2230       tree = create_token_tree (dfa, NULL, NULL, token);
2231       if (BE (tree == NULL, 0))
2232         {
2233           *err = REG_ESPACE;
2234           return NULL;
2235         }
2236 #ifdef RE_ENABLE_I18N
2237       if (dfa->mb_cur_max > 1)
2238         {
2239           while (!re_string_eoi (regexp)
2240                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2241             {
2242               bin_tree_t *mbc_remain;
2243               fetch_token (token, regexp, syntax);
2244               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2245               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2246               if (BE (mbc_remain == NULL || tree == NULL, 0))
2247                 {
2248                   *err = REG_ESPACE;
2249                   return NULL;
2250                 }
2251             }
2252         }
2253 #endif
2254       break;
2255     case OP_OPEN_SUBEXP:
2256       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2257       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2258         return NULL;
2259       break;
2260     case OP_OPEN_BRACKET:
2261       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2262       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2263         return NULL;
2264       break;
2265     case OP_BACK_REF:
2266       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2267         {
2268           *err = REG_ESUBREG;
2269           return NULL;
2270         }
2271       dfa->used_bkref_map |= 1 << token->opr.idx;
2272       tree = create_token_tree (dfa, NULL, NULL, token);
2273       if (BE (tree == NULL, 0))
2274         {
2275           *err = REG_ESPACE;
2276           return NULL;
2277         }
2278       ++dfa->nbackref;
2279       dfa->has_mb_node = 1;
2280       break;
2281     case OP_OPEN_DUP_NUM:
2282       if (syntax & RE_CONTEXT_INVALID_DUP)
2283         {
2284           *err = REG_BADRPT;
2285           return NULL;
2286         }
2287       /* FALLTHROUGH */
2288     case OP_DUP_ASTERISK:
2289     case OP_DUP_PLUS:
2290     case OP_DUP_QUESTION:
2291       if (syntax & RE_CONTEXT_INVALID_OPS)
2292         {
2293           *err = REG_BADRPT;
2294           return NULL;
2295         }
2296       else if (syntax & RE_CONTEXT_INDEP_OPS)
2297         {
2298           fetch_token (token, regexp, syntax);
2299           return parse_expression (regexp, preg, token, syntax, nest, err);
2300         }
2301       /* else fall through  */
2302     case OP_CLOSE_SUBEXP:
2303       if ((token->type == OP_CLOSE_SUBEXP) &&
2304           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2305         {
2306           *err = REG_ERPAREN;
2307           return NULL;
2308         }
2309       /* else fall through  */
2310     case OP_CLOSE_DUP_NUM:
2311       /* We treat it as a normal character.  */
2312
2313       /* Then we can these characters as normal characters.  */
2314       token->type = CHARACTER;
2315       /* mb_partial and word_char bits should be initialized already
2316          by peek_token.  */
2317       tree = create_token_tree (dfa, NULL, NULL, token);
2318       if (BE (tree == NULL, 0))
2319         {
2320           *err = REG_ESPACE;
2321           return NULL;
2322         }
2323       break;
2324     case ANCHOR:
2325       if ((token->opr.ctx_type
2326            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2327           && dfa->word_ops_used == 0)
2328         init_word_char (dfa);
2329       if (token->opr.ctx_type == WORD_DELIM
2330           || token->opr.ctx_type == NOT_WORD_DELIM)
2331         {
2332           bin_tree_t *tree_first, *tree_last;
2333           if (token->opr.ctx_type == WORD_DELIM)
2334             {
2335               token->opr.ctx_type = WORD_FIRST;
2336               tree_first = create_token_tree (dfa, NULL, NULL, token);
2337               token->opr.ctx_type = WORD_LAST;
2338             }
2339           else
2340             {
2341               token->opr.ctx_type = INSIDE_WORD;
2342               tree_first = create_token_tree (dfa, NULL, NULL, token);
2343               token->opr.ctx_type = INSIDE_NOTWORD;
2344             }
2345           tree_last = create_token_tree (dfa, NULL, NULL, token);
2346           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2347           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2348             {
2349               *err = REG_ESPACE;
2350               return NULL;
2351             }
2352         }
2353       else
2354         {
2355           tree = create_token_tree (dfa, NULL, NULL, token);
2356           if (BE (tree == NULL, 0))
2357             {
2358               *err = REG_ESPACE;
2359               return NULL;
2360             }
2361         }
2362       /* We must return here, since ANCHORs can't be followed
2363          by repetition operators.
2364          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2365              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2366       fetch_token (token, regexp, syntax);
2367       return tree;
2368     case OP_PERIOD:
2369       tree = create_token_tree (dfa, NULL, NULL, token);
2370       if (BE (tree == NULL, 0))
2371         {
2372           *err = REG_ESPACE;
2373           return NULL;
2374         }
2375       if (dfa->mb_cur_max > 1)
2376         dfa->has_mb_node = 1;
2377       break;
2378     case OP_WORD:
2379     case OP_NOTWORD:
2380       tree = build_charclass_op (dfa, regexp->trans,
2381                                  (const unsigned char *) "alnum",
2382                                  (const unsigned char *) "_",
2383                                  token->type == OP_NOTWORD, err);
2384       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2385         return NULL;
2386       break;
2387     case OP_SPACE:
2388     case OP_NOTSPACE:
2389       tree = build_charclass_op (dfa, regexp->trans,
2390                                  (const unsigned char *) "space",
2391                                  (const unsigned char *) "",
2392                                  token->type == OP_NOTSPACE, err);
2393       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394         return NULL;
2395       break;
2396     case OP_ALT:
2397     case END_OF_RE:
2398       return NULL;
2399     case BACK_SLASH:
2400       *err = REG_EESCAPE;
2401       return NULL;
2402     default:
2403       /* Must not happen?  */
2404 #ifdef DEBUG
2405       assert (0);
2406 #endif
2407       return NULL;
2408     }
2409   fetch_token (token, regexp, syntax);
2410
2411   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2412          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2413     {
2414       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2415       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2416         return NULL;
2417       /* In BRE consecutive duplications are not allowed.  */
2418       if ((syntax & RE_CONTEXT_INVALID_DUP)
2419           && (token->type == OP_DUP_ASTERISK
2420               || token->type == OP_OPEN_DUP_NUM))
2421         {
2422           *err = REG_BADRPT;
2423           return NULL;
2424         }
2425     }
2426
2427   return tree;
2428 }
2429
2430 /* This function build the following tree, from regular expression
2431    (<reg_exp>):
2432          SUBEXP
2433             |
2434         <reg_exp>
2435 */
2436
2437 static bin_tree_t *
2438 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2439                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2440 {
2441   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2442   bin_tree_t *tree;
2443   size_t cur_nsub;
2444   cur_nsub = preg->re_nsub++;
2445
2446   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2447
2448   /* The subexpression may be a null string.  */
2449   if (token->type == OP_CLOSE_SUBEXP)
2450     tree = NULL;
2451   else
2452     {
2453       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2454       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2455         *err = REG_EPAREN;
2456       if (BE (*err != REG_NOERROR, 0))
2457         return NULL;
2458     }
2459
2460   if (cur_nsub <= '9' - '1')
2461     dfa->completed_bkref_map |= 1 << cur_nsub;
2462
2463   tree = create_tree (dfa, tree, NULL, SUBEXP);
2464   if (BE (tree == NULL, 0))
2465     {
2466       *err = REG_ESPACE;
2467       return NULL;
2468     }
2469   tree->token.opr.idx = cur_nsub;
2470   return tree;
2471 }
2472
2473 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2474
2475 static bin_tree_t *
2476 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2477               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2478 {
2479   bin_tree_t *tree = NULL, *old_tree = NULL;
2480   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2481   re_token_t start_token = *token;
2482
2483   if (token->type == OP_OPEN_DUP_NUM)
2484     {
2485       end = 0;
2486       start = fetch_number (regexp, token, syntax);
2487       if (start == REG_MISSING)
2488         {
2489           if (token->type == CHARACTER && token->opr.c == ',')
2490             start = 0; /* We treat "{,m}" as "{0,m}".  */
2491           else
2492             {
2493               *err = REG_BADBR; /* <re>{} is invalid.  */
2494               return NULL;
2495             }
2496         }
2497       if (BE (start != REG_ERROR, 1))
2498         {
2499           /* We treat "{n}" as "{n,n}".  */
2500           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2501                  : ((token->type == CHARACTER && token->opr.c == ',')
2502                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2503         }
2504       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2505         {
2506           /* Invalid sequence.  */
2507           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2508             {
2509               if (token->type == END_OF_RE)
2510                 *err = REG_EBRACE;
2511               else
2512                 *err = REG_BADBR;
2513
2514               return NULL;
2515             }
2516
2517           /* If the syntax bit is set, rollback.  */
2518           re_string_set_index (regexp, start_idx);
2519           *token = start_token;
2520           token->type = CHARACTER;
2521           /* mb_partial and word_char bits should be already initialized by
2522              peek_token.  */
2523           return elem;
2524         }
2525
2526       if (BE ((end != REG_MISSING && start > end)
2527               || token->type != OP_CLOSE_DUP_NUM, 0))
2528         {
2529           /* First number greater than second.  */
2530           *err = REG_BADBR;
2531           return NULL;
2532         }
2533     }
2534   else
2535     {
2536       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2537       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2538     }
2539
2540   fetch_token (token, regexp, syntax);
2541
2542   if (BE (elem == NULL, 0))
2543     return NULL;
2544   if (BE (start == 0 && end == 0, 0))
2545     {
2546       postorder (elem, free_tree, NULL);
2547       return NULL;
2548     }
2549
2550   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2551   if (BE (start > 0, 0))
2552     {
2553       tree = elem;
2554       for (i = 2; i <= start; ++i)
2555         {
2556           elem = duplicate_tree (elem, dfa);
2557           tree = create_tree (dfa, tree, elem, CONCAT);
2558           if (BE (elem == NULL || tree == NULL, 0))
2559             goto parse_dup_op_espace;
2560         }
2561
2562       if (start == end)
2563         return tree;
2564
2565       /* Duplicate ELEM before it is marked optional.  */
2566       elem = duplicate_tree (elem, dfa);
2567       old_tree = tree;
2568     }
2569   else
2570     old_tree = NULL;
2571
2572   if (elem->token.type == SUBEXP)
2573     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2574
2575   tree = create_tree (dfa, elem, NULL,
2576                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2577   if (BE (tree == NULL, 0))
2578     goto parse_dup_op_espace;
2579
2580 /* From gnulib's "intprops.h":
2581    True if the arithmetic type T is signed.  */
2582 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2583
2584   /* This loop is actually executed only when end != REG_MISSING,
2585      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2586      already created the start+1-th copy.  */
2587   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2588     for (i = start + 2; i <= end; ++i)
2589       {
2590         elem = duplicate_tree (elem, dfa);
2591         tree = create_tree (dfa, tree, elem, CONCAT);
2592         if (BE (elem == NULL || tree == NULL, 0))
2593           goto parse_dup_op_espace;
2594
2595         tree = create_tree (dfa, tree, NULL, OP_ALT);
2596         if (BE (tree == NULL, 0))
2597           goto parse_dup_op_espace;
2598       }
2599
2600   if (old_tree)
2601     tree = create_tree (dfa, old_tree, tree, CONCAT);
2602
2603   return tree;
2604
2605  parse_dup_op_espace:
2606   *err = REG_ESPACE;
2607   return NULL;
2608 }
2609
2610 /* Size of the names for collating symbol/equivalence_class/character_class.
2611    I'm not sure, but maybe enough.  */
2612 #define BRACKET_NAME_BUF_SIZE 32
2613
2614 #ifndef _LIBC
2615   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2616      Build the range expression which starts from START_ELEM, and ends
2617      at END_ELEM.  The result are written to MBCSET and SBCSET.
2618      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2619      mbcset->range_ends, is a pointer argument sinse we may
2620      update it.  */
2621
2622 static reg_errcode_t
2623 internal_function
2624 # ifdef RE_ENABLE_I18N
2625 build_range_exp (const reg_syntax_t syntax,
2626                  bitset_t sbcset,
2627                  re_charset_t *mbcset,
2628                  Idx *range_alloc,
2629                  const bracket_elem_t *start_elem,
2630                  const bracket_elem_t *end_elem)
2631 # else /* not RE_ENABLE_I18N */
2632 build_range_exp (const reg_syntax_t syntax,
2633                  bitset_t sbcset,
2634                  const bracket_elem_t *start_elem,
2635                  const bracket_elem_t *end_elem)
2636 # endif /* not RE_ENABLE_I18N */
2637 {
2638   unsigned int start_ch, end_ch;
2639   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2640   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2642           0))
2643     return REG_ERANGE;
2644
2645   /* We can handle no multi character collating elements without libc
2646      support.  */
2647   if (BE ((start_elem->type == COLL_SYM
2648            && strlen ((char *) start_elem->opr.name) > 1)
2649           || (end_elem->type == COLL_SYM
2650               && strlen ((char *) end_elem->opr.name) > 1), 0))
2651     return REG_ECOLLATE;
2652
2653 # ifdef RE_ENABLE_I18N
2654   {
2655     wchar_t wc;
2656     wint_t start_wc;
2657     wint_t end_wc;
2658     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2659
2660     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2662                    : 0));
2663     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2665                  : 0));
2666     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2667                 ? __btowc (start_ch) : start_elem->opr.wch);
2668     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2669               ? __btowc (end_ch) : end_elem->opr.wch);
2670     if (start_wc == WEOF || end_wc == WEOF)
2671       return REG_ECOLLATE;
2672     cmp_buf[0] = start_wc;
2673     cmp_buf[4] = end_wc;
2674
2675     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2676             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2677       return REG_ERANGE;
2678
2679     /* Got valid collation sequence values, add them as a new entry.
2680        However, for !_LIBC we have no collation elements: if the
2681        character set is single byte, the single byte character set
2682        that we build below suffices.  parse_bracket_exp passes
2683        no MBCSET if dfa->mb_cur_max == 1.  */
2684     if (mbcset)
2685       {
2686         /* Check the space of the arrays.  */
2687         if (BE (*range_alloc == mbcset->nranges, 0))
2688           {
2689             /* There is not enough space, need realloc.  */
2690             wchar_t *new_array_start, *new_array_end;
2691             Idx new_nranges;
2692
2693             /* +1 in case of mbcset->nranges is 0.  */
2694             new_nranges = 2 * mbcset->nranges + 1;
2695             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2696                are NULL if *range_alloc == 0.  */
2697             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2698                                           new_nranges);
2699             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2700                                         new_nranges);
2701
2702             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2703               return REG_ESPACE;
2704
2705             mbcset->range_starts = new_array_start;
2706             mbcset->range_ends = new_array_end;
2707             *range_alloc = new_nranges;
2708           }
2709
2710         mbcset->range_starts[mbcset->nranges] = start_wc;
2711         mbcset->range_ends[mbcset->nranges++] = end_wc;
2712       }
2713
2714     /* Build the table for single byte characters.  */
2715     for (wc = 0; wc < SBC_MAX; ++wc)
2716       {
2717         cmp_buf[2] = wc;
2718         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2719             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2720           bitset_set (sbcset, wc);
2721       }
2722   }
2723 # else /* not RE_ENABLE_I18N */
2724   {
2725     unsigned int ch;
2726     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2727                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2728                    : 0));
2729     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2730               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2731                  : 0));
2732     if (start_ch > end_ch)
2733       return REG_ERANGE;
2734     /* Build the table for single byte characters.  */
2735     for (ch = 0; ch < SBC_MAX; ++ch)
2736       if (start_ch <= ch  && ch <= end_ch)
2737         bitset_set (sbcset, ch);
2738   }
2739 # endif /* not RE_ENABLE_I18N */
2740   return REG_NOERROR;
2741 }
2742 #endif /* not _LIBC */
2743
2744 #ifndef _LIBC
2745 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2746    Build the collating element which is represented by NAME.
2747    The result are written to MBCSET and SBCSET.
2748    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2749    pointer argument since we may update it.  */
2750
2751 static reg_errcode_t
2752 internal_function
2753 build_collating_symbol (bitset_t sbcset,
2754 # ifdef RE_ENABLE_I18N
2755                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2756 # endif
2757                         const unsigned char *name)
2758 {
2759   size_t name_len = strlen ((const char *) name);
2760   if (BE (name_len != 1, 0))
2761     return REG_ECOLLATE;
2762   else
2763     {
2764       bitset_set (sbcset, name[0]);
2765       return REG_NOERROR;
2766     }
2767 }
2768 #endif /* not _LIBC */
2769
2770 /* This function parse bracket expression like "[abc]", "[a-c]",
2771    "[[.a-a.]]" etc.  */
2772
2773 static bin_tree_t *
2774 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2775                    reg_syntax_t syntax, reg_errcode_t *err)
2776 {
2777 #ifdef _LIBC
2778   const unsigned char *collseqmb;
2779   const char *collseqwc;
2780   uint32_t nrules;
2781   int32_t table_size;
2782   const int32_t *symb_table;
2783   const unsigned char *extra;
2784
2785   /* Local function for parse_bracket_exp used in _LIBC environement.
2786      Seek the collating symbol entry correspondings to NAME.
2787      Return the index of the symbol in the SYMB_TABLE.  */
2788
2789   auto inline int32_t
2790   __attribute ((always_inline))
2791   seek_collating_symbol_entry (name, name_len)
2792          const unsigned char *name;
2793          size_t name_len;
2794     {
2795       int32_t hash = elem_hash ((const char *) name, name_len);
2796       int32_t elem = hash % table_size;
2797       if (symb_table[2 * elem] != 0)
2798         {
2799           int32_t second = hash % (table_size - 2) + 1;
2800
2801           do
2802             {
2803               /* First compare the hashing value.  */
2804               if (symb_table[2 * elem] == hash
2805                   /* Compare the length of the name.  */
2806                   && name_len == extra[symb_table[2 * elem + 1]]
2807                   /* Compare the name.  */
2808                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2809                              name_len) == 0)
2810                 {
2811                   /* Yep, this is the entry.  */
2812                   break;
2813                 }
2814
2815               /* Next entry.  */
2816               elem += second;
2817             }
2818           while (symb_table[2 * elem] != 0);
2819         }
2820       return elem;
2821     }
2822
2823   /* Local function for parse_bracket_exp used in _LIBC environment.
2824      Look up the collation sequence value of BR_ELEM.
2825      Return the value if succeeded, UINT_MAX otherwise.  */
2826
2827   auto inline unsigned int
2828   __attribute ((always_inline))
2829   lookup_collation_sequence_value (br_elem)
2830          bracket_elem_t *br_elem;
2831     {
2832       if (br_elem->type == SB_CHAR)
2833         {
2834           /*
2835           if (MB_CUR_MAX == 1)
2836           */
2837           if (nrules == 0)
2838             return collseqmb[br_elem->opr.ch];
2839           else
2840             {
2841               wint_t wc = __btowc (br_elem->opr.ch);
2842               return __collseq_table_lookup (collseqwc, wc);
2843             }
2844         }
2845       else if (br_elem->type == MB_CHAR)
2846         {
2847           if (nrules != 0)
2848             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2849         }
2850       else if (br_elem->type == COLL_SYM)
2851         {
2852           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2853           if (nrules != 0)
2854             {
2855               int32_t elem, idx;
2856               elem = seek_collating_symbol_entry (br_elem->opr.name,
2857                                                   sym_name_len);
2858               if (symb_table[2 * elem] != 0)
2859                 {
2860                   /* We found the entry.  */
2861                   idx = symb_table[2 * elem + 1];
2862                   /* Skip the name of collating element name.  */
2863                   idx += 1 + extra[idx];
2864                   /* Skip the byte sequence of the collating element.  */
2865                   idx += 1 + extra[idx];
2866                   /* Adjust for the alignment.  */
2867                   idx = (idx + 3) & ~3;
2868                   /* Skip the multibyte collation sequence value.  */
2869                   idx += sizeof (unsigned int);
2870                   /* Skip the wide char sequence of the collating element.  */
2871                   idx += sizeof (unsigned int) *
2872                     (1 + *(unsigned int *) (extra + idx));
2873                   /* Return the collation sequence value.  */
2874                   return *(unsigned int *) (extra + idx);
2875                 }
2876               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2877                 {
2878                   /* No valid character.  Match it as a single byte
2879                      character.  */
2880                   return collseqmb[br_elem->opr.name[0]];
2881                 }
2882             }
2883           else if (sym_name_len == 1)
2884             return collseqmb[br_elem->opr.name[0]];
2885         }
2886       return UINT_MAX;
2887     }
2888
2889   /* Local function for parse_bracket_exp used in _LIBC environement.
2890      Build the range expression which starts from START_ELEM, and ends
2891      at END_ELEM.  The result are written to MBCSET and SBCSET.
2892      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2893      mbcset->range_ends, is a pointer argument sinse we may
2894      update it.  */
2895
2896   auto inline reg_errcode_t
2897   __attribute ((always_inline))
2898   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2899          re_charset_t *mbcset;
2900          Idx *range_alloc;
2901          bitset_t sbcset;
2902          bracket_elem_t *start_elem, *end_elem;
2903     {
2904       unsigned int ch;
2905       uint32_t start_collseq;
2906       uint32_t end_collseq;
2907
2908       /* Equivalence Classes and Character Classes can't be a range
2909          start/end.  */
2910       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2911               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2912               0))
2913         return REG_ERANGE;
2914
2915       start_collseq = lookup_collation_sequence_value (start_elem);
2916       end_collseq = lookup_collation_sequence_value (end_elem);
2917       /* Check start/end collation sequence values.  */
2918       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2919         return REG_ECOLLATE;
2920       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2921         return REG_ERANGE;
2922
2923       /* Got valid collation sequence values, add them as a new entry.
2924          However, if we have no collation elements, and the character set
2925          is single byte, the single byte character set that we
2926          build below suffices. */
2927       if (nrules > 0 || dfa->mb_cur_max > 1)
2928         {
2929           /* Check the space of the arrays.  */
2930           if (BE (*range_alloc == mbcset->nranges, 0))
2931             {
2932               /* There is not enough space, need realloc.  */
2933               uint32_t *new_array_start;
2934               uint32_t *new_array_end;
2935               Idx new_nranges;
2936
2937               /* +1 in case of mbcset->nranges is 0.  */
2938               new_nranges = 2 * mbcset->nranges + 1;
2939               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2940                                             new_nranges);
2941               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2942                                           new_nranges);
2943
2944               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2945                 return REG_ESPACE;
2946
2947               mbcset->range_starts = new_array_start;
2948               mbcset->range_ends = new_array_end;
2949               *range_alloc = new_nranges;
2950             }
2951
2952           mbcset->range_starts[mbcset->nranges] = start_collseq;
2953           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2954         }
2955
2956       /* Build the table for single byte characters.  */
2957       for (ch = 0; ch < SBC_MAX; ch++)
2958         {
2959           uint32_t ch_collseq;
2960           /*
2961           if (MB_CUR_MAX == 1)
2962           */
2963           if (nrules == 0)
2964             ch_collseq = collseqmb[ch];
2965           else
2966             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2967           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2968             bitset_set (sbcset, ch);
2969         }
2970       return REG_NOERROR;
2971     }
2972
2973   /* Local function for parse_bracket_exp used in _LIBC environement.
2974      Build the collating element which is represented by NAME.
2975      The result are written to MBCSET and SBCSET.
2976      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2977      pointer argument sinse we may update it.  */
2978
2979   auto inline reg_errcode_t
2980   __attribute ((always_inline))
2981   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2982          re_charset_t *mbcset;
2983          Idx *coll_sym_alloc;
2984          bitset_t sbcset;
2985          const unsigned char *name;
2986     {
2987       int32_t elem, idx;
2988       size_t name_len = strlen ((const char *) name);
2989       if (nrules != 0)
2990         {
2991           elem = seek_collating_symbol_entry (name, name_len);
2992           if (symb_table[2 * elem] != 0)
2993             {
2994               /* We found the entry.  */
2995               idx = symb_table[2 * elem + 1];
2996               /* Skip the name of collating element name.  */
2997               idx += 1 + extra[idx];
2998             }
2999           else if (symb_table[2 * elem] == 0 && name_len == 1)
3000             {
3001               /* No valid character, treat it as a normal
3002                  character.  */
3003               bitset_set (sbcset, name[0]);
3004               return REG_NOERROR;
3005             }
3006           else
3007             return REG_ECOLLATE;
3008
3009           /* Got valid collation sequence, add it as a new entry.  */
3010           /* Check the space of the arrays.  */
3011           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3012             {
3013               /* Not enough, realloc it.  */
3014               /* +1 in case of mbcset->ncoll_syms is 0.  */
3015               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3016               /* Use realloc since mbcset->coll_syms is NULL
3017                  if *alloc == 0.  */
3018               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3019                                                    new_coll_sym_alloc);
3020               if (BE (new_coll_syms == NULL, 0))
3021                 return REG_ESPACE;
3022               mbcset->coll_syms = new_coll_syms;
3023               *coll_sym_alloc = new_coll_sym_alloc;
3024             }
3025           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3026           return REG_NOERROR;
3027         }
3028       else
3029         {
3030           if (BE (name_len != 1, 0))
3031             return REG_ECOLLATE;
3032           else
3033             {
3034               bitset_set (sbcset, name[0]);
3035               return REG_NOERROR;
3036             }
3037         }
3038     }
3039 #endif
3040
3041   re_token_t br_token;
3042   re_bitset_ptr_t sbcset;
3043 #ifdef RE_ENABLE_I18N
3044   re_charset_t *mbcset;
3045   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3046   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3047 #endif /* not RE_ENABLE_I18N */
3048   bool non_match = false;
3049   bin_tree_t *work_tree;
3050   int token_len;
3051   bool first_round = true;
3052 #ifdef _LIBC
3053   collseqmb = (const unsigned char *)
3054     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3055   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3056   if (nrules)
3057     {
3058       /*
3059       if (MB_CUR_MAX > 1)
3060       */
3061       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3062       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3063       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3064                                                   _NL_COLLATE_SYMB_TABLEMB);
3065       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3066                                                    _NL_COLLATE_SYMB_EXTRAMB);
3067     }
3068 #endif
3069   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3070 #ifdef RE_ENABLE_I18N
3071   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3072 #endif /* RE_ENABLE_I18N */
3073 #ifdef RE_ENABLE_I18N
3074   if (BE (sbcset == NULL || mbcset == NULL, 0))
3075 #else
3076   if (BE (sbcset == NULL, 0))
3077 #endif /* RE_ENABLE_I18N */
3078     {
3079       *err = REG_ESPACE;
3080       return NULL;
3081     }
3082
3083   token_len = peek_token_bracket (token, regexp, syntax);
3084   if (BE (token->type == END_OF_RE, 0))
3085     {
3086       *err = REG_BADPAT;
3087       goto parse_bracket_exp_free_return;
3088     }
3089   if (token->type == OP_NON_MATCH_LIST)
3090     {
3091 #ifdef RE_ENABLE_I18N
3092       mbcset->non_match = 1;
3093 #endif /* not RE_ENABLE_I18N */
3094       non_match = true;
3095       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3096         bitset_set (sbcset, '\n');
3097       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3098       token_len = peek_token_bracket (token, regexp, syntax);
3099       if (BE (token->type == END_OF_RE, 0))
3100         {
3101           *err = REG_BADPAT;
3102           goto parse_bracket_exp_free_return;
3103         }
3104     }
3105
3106   /* We treat the first ']' as a normal character.  */
3107   if (token->type == OP_CLOSE_BRACKET)
3108     token->type = CHARACTER;
3109
3110   while (1)
3111     {
3112       bracket_elem_t start_elem, end_elem;
3113       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3114       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3115       reg_errcode_t ret;
3116       int token_len2 = 0;
3117       bool is_range_exp = false;
3118       re_token_t token2;
3119
3120       start_elem.opr.name = start_name_buf;
3121       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3122                                    syntax, first_round);
3123       if (BE (ret != REG_NOERROR, 0))
3124         {
3125           *err = ret;
3126           goto parse_bracket_exp_free_return;
3127         }
3128       first_round = false;
3129
3130       /* Get information about the next token.  We need it in any case.  */
3131       token_len = peek_token_bracket (token, regexp, syntax);
3132
3133       /* Do not check for ranges if we know they are not allowed.  */
3134       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3135         {
3136           if (BE (token->type == END_OF_RE, 0))
3137             {
3138               *err = REG_EBRACK;
3139               goto parse_bracket_exp_free_return;
3140             }
3141           if (token->type == OP_CHARSET_RANGE)
3142             {
3143               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3144               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3145               if (BE (token2.type == END_OF_RE, 0))
3146                 {
3147                   *err = REG_EBRACK;
3148                   goto parse_bracket_exp_free_return;
3149                 }
3150               if (token2.type == OP_CLOSE_BRACKET)
3151                 {
3152                   /* We treat the last '-' as a normal character.  */
3153                   re_string_skip_bytes (regexp, -token_len);
3154                   token->type = CHARACTER;
3155                 }
3156               else
3157                 is_range_exp = true;
3158             }
3159         }
3160
3161       if (is_range_exp == true)
3162         {
3163           end_elem.opr.name = end_name_buf;
3164           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3165                                        dfa, syntax, true);
3166           if (BE (ret != REG_NOERROR, 0))
3167             {
3168               *err = ret;
3169               goto parse_bracket_exp_free_return;
3170             }
3171
3172           token_len = peek_token_bracket (token, regexp, syntax);
3173
3174 #ifdef _LIBC
3175           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3176                                   &start_elem, &end_elem);
3177 #else
3178 # ifdef RE_ENABLE_I18N
3179           *err = build_range_exp (syntax, sbcset,
3180                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3181                                   &range_alloc, &start_elem, &end_elem);
3182 # else
3183           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3184 # endif
3185 #endif /* RE_ENABLE_I18N */
3186           if (BE (*err != REG_NOERROR, 0))
3187             goto parse_bracket_exp_free_return;
3188         }
3189       else
3190         {
3191           switch (start_elem.type)
3192             {
3193             case SB_CHAR:
3194               bitset_set (sbcset, start_elem.opr.ch);
3195               break;
3196 #ifdef RE_ENABLE_I18N
3197             case MB_CHAR:
3198               /* Check whether the array has enough space.  */
3199               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3200                 {
3201                   wchar_t *new_mbchars;
3202                   /* Not enough, realloc it.  */
3203                   /* +1 in case of mbcset->nmbchars is 0.  */
3204                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3205                   /* Use realloc since array is NULL if *alloc == 0.  */
3206                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3207                                             mbchar_alloc);
3208                   if (BE (new_mbchars == NULL, 0))
3209                     goto parse_bracket_exp_espace;
3210                   mbcset->mbchars = new_mbchars;
3211                 }
3212               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3213               break;
3214 #endif /* RE_ENABLE_I18N */
3215             case EQUIV_CLASS:
3216               *err = build_equiv_class (sbcset,
3217 #ifdef RE_ENABLE_I18N
3218                                         mbcset, &equiv_class_alloc,
3219 #endif /* RE_ENABLE_I18N */
3220                                         start_elem.opr.name);
3221               if (BE (*err != REG_NOERROR, 0))
3222                 goto parse_bracket_exp_free_return;
3223               break;
3224             case COLL_SYM:
3225               *err = build_collating_symbol (sbcset,
3226 #ifdef RE_ENABLE_I18N
3227                                              mbcset, &coll_sym_alloc,
3228 #endif /* RE_ENABLE_I18N */
3229                                              start_elem.opr.name);
3230               if (BE (*err != REG_NOERROR, 0))
3231                 goto parse_bracket_exp_free_return;
3232               break;
3233             case CHAR_CLASS:
3234               *err = build_charclass (regexp->trans, sbcset,
3235 #ifdef RE_ENABLE_I18N
3236                                       mbcset, &char_class_alloc,
3237 #endif /* RE_ENABLE_I18N */
3238                                       start_elem.opr.name, syntax);
3239               if (BE (*err != REG_NOERROR, 0))
3240                goto parse_bracket_exp_free_return;
3241               break;
3242             default:
3243               assert (0);
3244               break;
3245             }
3246         }
3247       if (BE (token->type == END_OF_RE, 0))
3248         {
3249           *err = REG_EBRACK;
3250           goto parse_bracket_exp_free_return;
3251         }
3252       if (token->type == OP_CLOSE_BRACKET)
3253         break;
3254     }
3255
3256   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3257
3258   /* If it is non-matching list.  */
3259   if (non_match)
3260     bitset_not (sbcset);
3261
3262 #ifdef RE_ENABLE_I18N
3263   /* Ensure only single byte characters are set.  */
3264   if (dfa->mb_cur_max > 1)
3265     bitset_mask (sbcset, dfa->sb_char);
3266
3267   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3268       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3269                                                      || mbcset->non_match)))
3270     {
3271       bin_tree_t *mbc_tree;
3272       int sbc_idx;
3273       /* Build a tree for complex bracket.  */
3274       dfa->has_mb_node = 1;
3275       br_token.type = COMPLEX_BRACKET;
3276       br_token.opr.mbcset = mbcset;
3277       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3278       if (BE (mbc_tree == NULL, 0))
3279         goto parse_bracket_exp_espace;
3280       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3281         if (sbcset[sbc_idx])
3282           break;
3283       /* If there are no bits set in sbcset, there is no point
3284          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3285       if (sbc_idx < BITSET_WORDS)
3286         {
3287           /* Build a tree for simple bracket.  */
3288           br_token.type = SIMPLE_BRACKET;
3289           br_token.opr.sbcset = sbcset;
3290           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3291           if (BE (work_tree == NULL, 0))
3292             goto parse_bracket_exp_espace;
3293
3294           /* Then join them by ALT node.  */
3295           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3296           if (BE (work_tree == NULL, 0))
3297             goto parse_bracket_exp_espace;
3298         }
3299       else
3300         {
3301           re_free (sbcset);
3302           work_tree = mbc_tree;
3303         }
3304     }
3305   else
3306 #endif /* not RE_ENABLE_I18N */
3307     {
3308 #ifdef RE_ENABLE_I18N
3309       free_charset (mbcset);
3310 #endif
3311       /* Build a tree for simple bracket.  */
3312       br_token.type = SIMPLE_BRACKET;
3313       br_token.opr.sbcset = sbcset;
3314       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3315       if (BE (work_tree == NULL, 0))
3316         goto parse_bracket_exp_espace;
3317     }
3318   return work_tree;
3319
3320  parse_bracket_exp_espace:
3321   *err = REG_ESPACE;
3322  parse_bracket_exp_free_return:
3323   re_free (sbcset);
3324 #ifdef RE_ENABLE_I18N
3325   free_charset (mbcset);
3326 #endif /* RE_ENABLE_I18N */
3327   return NULL;
3328 }
3329
3330 /* Parse an element in the bracket expression.  */
3331
3332 static reg_errcode_t
3333 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3334                        re_token_t *token, int token_len, re_dfa_t *dfa,
3335                        reg_syntax_t syntax, bool accept_hyphen)
3336 {
3337 #ifdef RE_ENABLE_I18N
3338   int cur_char_size;
3339   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3340   if (cur_char_size > 1)
3341     {
3342       elem->type = MB_CHAR;
3343       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3344       re_string_skip_bytes (regexp, cur_char_size);
3345       return REG_NOERROR;
3346     }
3347 #endif /* RE_ENABLE_I18N */
3348   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3349   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3350       || token->type == OP_OPEN_EQUIV_CLASS)
3351     return parse_bracket_symbol (elem, regexp, token);
3352   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3353     {
3354       /* A '-' must only appear as anything but a range indicator before
3355          the closing bracket.  Everything else is an error.  */
3356       re_token_t token2;
3357       (void) peek_token_bracket (&token2, regexp, syntax);
3358       if (token2.type != OP_CLOSE_BRACKET)
3359         /* The actual error value is not standardized since this whole
3360            case is undefined.  But ERANGE makes good sense.  */
3361         return REG_ERANGE;
3362     }
3363   elem->type = SB_CHAR;
3364   elem->opr.ch = token->opr.c;
3365   return REG_NOERROR;
3366 }
3367
3368 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3369    such as [:<character_class>:], [.<collating_element>.], and
3370    [=<equivalent_class>=].  */
3371
3372 static reg_errcode_t
3373 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3374                       re_token_t *token)
3375 {
3376   unsigned char ch, delim = token->opr.c;
3377   int i = 0;
3378   if (re_string_eoi(regexp))
3379     return REG_EBRACK;
3380   for (;; ++i)
3381     {
3382       if (i >= BRACKET_NAME_BUF_SIZE)
3383         return REG_EBRACK;
3384       if (token->type == OP_OPEN_CHAR_CLASS)
3385         ch = re_string_fetch_byte_case (regexp);
3386       else
3387         ch = re_string_fetch_byte (regexp);
3388       if (re_string_eoi(regexp))
3389         return REG_EBRACK;
3390       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3391         break;
3392       elem->opr.name[i] = ch;
3393     }
3394   re_string_skip_bytes (regexp, 1);
3395   elem->opr.name[i] = '\0';
3396   switch (token->type)
3397     {
3398     case OP_OPEN_COLL_ELEM:
3399       elem->type = COLL_SYM;
3400       break;
3401     case OP_OPEN_EQUIV_CLASS:
3402       elem->type = EQUIV_CLASS;
3403       break;
3404     case OP_OPEN_CHAR_CLASS:
3405       elem->type = CHAR_CLASS;
3406       break;
3407     default:
3408       break;
3409     }
3410   return REG_NOERROR;
3411 }
3412
3413   /* Helper function for parse_bracket_exp.
3414      Build the equivalence class which is represented by NAME.
3415      The result are written to MBCSET and SBCSET.
3416      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3417      is a pointer argument sinse we may update it.  */
3418
3419 static reg_errcode_t
3420 #ifdef RE_ENABLE_I18N
3421 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3422                    Idx *equiv_class_alloc, const unsigned char *name)
3423 #else /* not RE_ENABLE_I18N */
3424 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3425 #endif /* not RE_ENABLE_I18N */
3426 {
3427 #ifdef _LIBC
3428   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3429   if (nrules != 0)
3430     {
3431       const int32_t *table, *indirect;
3432       const unsigned char *weights, *extra, *cp;
3433       unsigned char char_buf[2];
3434       int32_t idx1, idx2;
3435       unsigned int ch;
3436       size_t len;
3437       /* This #include defines a local function!  */
3438 # include <locale/weight.h>
3439       /* Calculate the index for equivalence class.  */
3440       cp = name;
3441       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3442       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3443                                                _NL_COLLATE_WEIGHTMB);
3444       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3445                                                    _NL_COLLATE_EXTRAMB);
3446       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3447                                                 _NL_COLLATE_INDIRECTMB);
3448       idx1 = findidx (&cp);
3449       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3450         /* This isn't a valid character.  */
3451         return REG_ECOLLATE;
3452
3453       /* Build single byte matcing table for this equivalence class.  */
3454       char_buf[1] = (unsigned char) '\0';
3455       len = weights[idx1 & 0xffffff];
3456       for (ch = 0; ch < SBC_MAX; ++ch)
3457         {
3458           char_buf[0] = ch;
3459           cp = char_buf;
3460           idx2 = findidx (&cp);
3461 /*
3462           idx2 = table[ch];
3463 */
3464           if (idx2 == 0)
3465             /* This isn't a valid character.  */
3466             continue;
3467           /* Compare only if the length matches and the collation rule
3468              index is the same.  */
3469           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3470             {
3471               int cnt = 0;
3472
3473               while (cnt <= len &&
3474                      weights[(idx1 & 0xffffff) + 1 + cnt]
3475                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3476                 ++cnt;
3477
3478               if (cnt > len)
3479                 bitset_set (sbcset, ch);
3480             }
3481         }
3482       /* Check whether the array has enough space.  */
3483       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3484         {
3485           /* Not enough, realloc it.  */
3486           /* +1 in case of mbcset->nequiv_classes is 0.  */
3487           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3488           /* Use realloc since the array is NULL if *alloc == 0.  */
3489           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3490                                                    int32_t,
3491                                                    new_equiv_class_alloc);
3492           if (BE (new_equiv_classes == NULL, 0))
3493             return REG_ESPACE;
3494           mbcset->equiv_classes = new_equiv_classes;
3495           *equiv_class_alloc = new_equiv_class_alloc;
3496         }
3497       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3498     }
3499   else
3500 #endif /* _LIBC */
3501     {
3502       if (BE (strlen ((const char *) name) != 1, 0))
3503         return REG_ECOLLATE;
3504       bitset_set (sbcset, *name);
3505     }
3506   return REG_NOERROR;
3507 }
3508
3509   /* Helper function for parse_bracket_exp.
3510      Build the character class which is represented by NAME.
3511      The result are written to MBCSET and SBCSET.
3512      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3513      is a pointer argument sinse we may update it.  */
3514
3515 static reg_errcode_t
3516 #ifdef RE_ENABLE_I18N
3517 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3518                  re_charset_t *mbcset, Idx *char_class_alloc,
3519                  const unsigned char *class_name, reg_syntax_t syntax)
3520 #else /* not RE_ENABLE_I18N */
3521 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3522                  const unsigned char *class_name, reg_syntax_t syntax)
3523 #endif /* not RE_ENABLE_I18N */
3524 {
3525   int i;
3526   const char *name = (const char *) class_name;
3527
3528   /* In case of REG_ICASE "upper" and "lower" match the both of
3529      upper and lower cases.  */
3530   if ((syntax & RE_ICASE)
3531       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3532     name = "alpha";
3533
3534 #ifdef RE_ENABLE_I18N
3535   /* Check the space of the arrays.  */
3536   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3537     {
3538       /* Not enough, realloc it.  */
3539       /* +1 in case of mbcset->nchar_classes is 0.  */
3540       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3541       /* Use realloc since array is NULL if *alloc == 0.  */
3542       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3543                                                new_char_class_alloc);
3544       if (BE (new_char_classes == NULL, 0))
3545         return REG_ESPACE;
3546       mbcset->char_classes = new_char_classes;
3547       *char_class_alloc = new_char_class_alloc;
3548     }
3549   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3550 #endif /* RE_ENABLE_I18N */
3551
3552 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3553   do {                                          \
3554     if (BE (trans != NULL, 0))                  \
3555       {                                         \
3556         for (i = 0; i < SBC_MAX; ++i)           \
3557           if (ctype_func (i))                   \
3558             bitset_set (sbcset, trans[i]);      \
3559       }                                         \
3560     else                                        \
3561       {                                         \
3562         for (i = 0; i < SBC_MAX; ++i)           \
3563           if (ctype_func (i))                   \
3564             bitset_set (sbcset, i);             \
3565       }                                         \
3566   } while (0)
3567
3568   if (strcmp (name, "alnum") == 0)
3569     BUILD_CHARCLASS_LOOP (isalnum);
3570   else if (strcmp (name, "cntrl") == 0)
3571     BUILD_CHARCLASS_LOOP (iscntrl);
3572   else if (strcmp (name, "lower") == 0)
3573     BUILD_CHARCLASS_LOOP (islower);
3574   else if (strcmp (name, "space") == 0)
3575     BUILD_CHARCLASS_LOOP (isspace);
3576   else if (strcmp (name, "alpha") == 0)
3577     BUILD_CHARCLASS_LOOP (isalpha);
3578   else if (strcmp (name, "digit") == 0)
3579     BUILD_CHARCLASS_LOOP (isdigit);
3580   else if (strcmp (name, "print") == 0)
3581     BUILD_CHARCLASS_LOOP (isprint);
3582   else if (strcmp (name, "upper") == 0)
3583     BUILD_CHARCLASS_LOOP (isupper);
3584   else if (strcmp (name, "blank") == 0)
3585     BUILD_CHARCLASS_LOOP (isblank);
3586   else if (strcmp (name, "graph") == 0)
3587     BUILD_CHARCLASS_LOOP (isgraph);
3588   else if (strcmp (name, "punct") == 0)
3589     BUILD_CHARCLASS_LOOP (ispunct);
3590   else if (strcmp (name, "xdigit") == 0)
3591     BUILD_CHARCLASS_LOOP (isxdigit);
3592   else
3593     return REG_ECTYPE;
3594
3595   return REG_NOERROR;
3596 }
3597
3598 static bin_tree_t *
3599 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3600                     const unsigned char *class_name,
3601                     const unsigned char *extra, bool non_match,
3602                     reg_errcode_t *err)
3603 {
3604   re_bitset_ptr_t sbcset;
3605 #ifdef RE_ENABLE_I18N
3606   re_charset_t *mbcset;
3607   Idx alloc = 0;
3608 #endif /* not RE_ENABLE_I18N */
3609   reg_errcode_t ret;
3610   re_token_t br_token;
3611   bin_tree_t *tree;
3612
3613   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3614 #ifdef RE_ENABLE_I18N
3615   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3616 #endif /* RE_ENABLE_I18N */
3617
3618 #ifdef RE_ENABLE_I18N
3619   if (BE (sbcset == NULL || mbcset == NULL, 0))
3620 #else /* not RE_ENABLE_I18N */
3621   if (BE (sbcset == NULL, 0))
3622 #endif /* not RE_ENABLE_I18N */
3623     {
3624       *err = REG_ESPACE;
3625       return NULL;
3626     }
3627
3628   if (non_match)
3629     {
3630 #ifdef RE_ENABLE_I18N
3631       mbcset->non_match = 1;
3632 #endif /* not RE_ENABLE_I18N */
3633     }
3634
3635   /* We don't care the syntax in this case.  */
3636   ret = build_charclass (trans, sbcset,
3637 #ifdef RE_ENABLE_I18N
3638                          mbcset, &alloc,
3639 #endif /* RE_ENABLE_I18N */
3640                          class_name, 0);
3641
3642   if (BE (ret != REG_NOERROR, 0))
3643     {
3644       re_free (sbcset);
3645 #ifdef RE_ENABLE_I18N
3646       free_charset (mbcset);
3647 #endif /* RE_ENABLE_I18N */
3648       *err = ret;
3649       return NULL;
3650     }
3651   /* \w match '_' also.  */
3652   for (; *extra; extra++)
3653     bitset_set (sbcset, *extra);
3654
3655   /* If it is non-matching list.  */
3656   if (non_match)
3657     bitset_not (sbcset);
3658
3659 #ifdef RE_ENABLE_I18N
3660   /* Ensure only single byte characters are set.  */
3661   if (dfa->mb_cur_max > 1)
3662     bitset_mask (sbcset, dfa->sb_char);
3663 #endif
3664
3665   /* Build a tree for simple bracket.  */
3666   br_token.type = SIMPLE_BRACKET;
3667   br_token.opr.sbcset = sbcset;
3668   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3669   if (BE (tree == NULL, 0))
3670     goto build_word_op_espace;
3671
3672 #ifdef RE_ENABLE_I18N
3673   if (dfa->mb_cur_max > 1)
3674     {
3675       bin_tree_t *mbc_tree;
3676       /* Build a tree for complex bracket.  */
3677       br_token.type = COMPLEX_BRACKET;
3678       br_token.opr.mbcset = mbcset;
3679       dfa->has_mb_node = 1;
3680       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3681       if (BE (mbc_tree == NULL, 0))
3682         goto build_word_op_espace;
3683       /* Then join them by ALT node.  */
3684       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3685       if (BE (mbc_tree != NULL, 1))
3686         return tree;
3687     }
3688   else
3689     {
3690       free_charset (mbcset);
3691       return tree;
3692     }
3693 #else /* not RE_ENABLE_I18N */
3694   return tree;
3695 #endif /* not RE_ENABLE_I18N */
3696
3697  build_word_op_espace:
3698   re_free (sbcset);
3699 #ifdef RE_ENABLE_I18N
3700   free_charset (mbcset);
3701 #endif /* RE_ENABLE_I18N */
3702   *err = REG_ESPACE;
3703   return NULL;
3704 }
3705
3706 /* This is intended for the expressions like "a{1,3}".
3707    Fetch a number from `input', and return the number.
3708    Return REG_MISSING if the number field is empty like "{,1}".
3709    Return REG_ERROR if an error occurred.  */
3710
3711 static Idx
3712 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3713 {
3714   Idx num = REG_MISSING;
3715   unsigned char c;
3716   while (1)
3717     {
3718       fetch_token (token, input, syntax);
3719       c = token->opr.c;
3720       if (BE (token->type == END_OF_RE, 0))
3721         return REG_ERROR;
3722       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3723         break;
3724       num = ((token->type != CHARACTER || c < '0' || '9' < c
3725               || num == REG_ERROR)
3726              ? REG_ERROR
3727              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3728       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3729     }
3730   return num;
3731 }
3732 \f
3733 #ifdef RE_ENABLE_I18N
3734 static void
3735 free_charset (re_charset_t *cset)
3736 {
3737   re_free (cset->mbchars);
3738 # ifdef _LIBC
3739   re_free (cset->coll_syms);
3740   re_free (cset->equiv_classes);
3741   re_free (cset->range_starts);
3742   re_free (cset->range_ends);
3743 # endif
3744   re_free (cset->char_classes);
3745   re_free (cset);
3746 }
3747 #endif /* RE_ENABLE_I18N */
3748 \f
3749 /* Functions for binary tree operation.  */
3750
3751 /* Create a tree node.  */
3752
3753 static bin_tree_t *
3754 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3755              re_token_type_t type)
3756 {
3757   re_token_t t;
3758   t.type = type;
3759   return create_token_tree (dfa, left, right, &t);
3760 }
3761
3762 static bin_tree_t *
3763 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3764                    const re_token_t *token)
3765 {
3766   bin_tree_t *tree;
3767   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3768     {
3769       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3770
3771       if (storage == NULL)
3772         return NULL;
3773       storage->next = dfa->str_tree_storage;
3774       dfa->str_tree_storage = storage;
3775       dfa->str_tree_storage_idx = 0;
3776     }
3777   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3778
3779   tree->parent = NULL;
3780   tree->left = left;
3781   tree->right = right;
3782   tree->token = *token;
3783   tree->token.duplicated = 0;
3784   tree->token.opt_subexp = 0;
3785   tree->first = NULL;
3786   tree->next = NULL;
3787   tree->node_idx = REG_MISSING;
3788
3789   if (left != NULL)
3790     left->parent = tree;
3791   if (right != NULL)
3792     right->parent = tree;
3793   return tree;
3794 }
3795
3796 /* Mark the tree SRC as an optional subexpression.
3797    To be called from preorder or postorder.  */
3798
3799 static reg_errcode_t
3800 mark_opt_subexp (void *extra, bin_tree_t *node)
3801 {
3802   Idx idx = (Idx) (long) extra;
3803   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3804     node->token.opt_subexp = 1;
3805
3806   return REG_NOERROR;
3807 }
3808
3809 /* Free the allocated memory inside NODE. */
3810
3811 static void
3812 free_token (re_token_t *node)
3813 {
3814 #ifdef RE_ENABLE_I18N
3815   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3816     free_charset (node->opr.mbcset);
3817   else
3818 #endif /* RE_ENABLE_I18N */
3819     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3820       re_free (node->opr.sbcset);
3821 }
3822
3823 /* Worker function for tree walking.  Free the allocated memory inside NODE
3824    and its children. */
3825
3826 static reg_errcode_t
3827 free_tree (void *extra, bin_tree_t *node)
3828 {
3829   free_token (&node->token);
3830   return REG_NOERROR;
3831 }
3832
3833
3834 /* Duplicate the node SRC, and return new node.  This is a preorder
3835    visit similar to the one implemented by the generic visitor, but
3836    we need more infrastructure to maintain two parallel trees --- so,
3837    it's easier to duplicate.  */
3838
3839 static bin_tree_t *
3840 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3841 {
3842   const bin_tree_t *node;
3843   bin_tree_t *dup_root;
3844   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3845
3846   for (node = root; ; )
3847     {
3848       /* Create a new tree and link it back to the current parent.  */
3849       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3850       if (*p_new == NULL)
3851         return NULL;
3852       (*p_new)->parent = dup_node;
3853       (*p_new)->token.duplicated = 1;
3854       dup_node = *p_new;
3855
3856       /* Go to the left node, or up and to the right.  */
3857       if (node->left)
3858         {
3859           node = node->left;
3860           p_new = &dup_node->left;
3861         }
3862       else
3863         {
3864           const bin_tree_t *prev = NULL;
3865           while (node->right == prev || node->right == NULL)
3866             {
3867               prev = node;
3868               node = node->parent;
3869               dup_node = dup_node->parent;
3870               if (!node)
3871                 return dup_root;
3872             }
3873           node = node->right;
3874           p_new = &dup_node->right;
3875         }
3876     }
3877 }