git.gag.com Git - debian/tar/blob - gnu/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2015 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU General Public
   8    License as published by the Free Software Foundation; either
   9    version 3 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         Idx *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       Idx *char_class_alloc,
  97                                       const char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const char *class_name,
 110                                        const char *extra,
 111                                        bool non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 static const char __re_error_msgid[] =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 static const size_t __re_error_msgid_idx[] =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 210    are set in BUFP on entry.  */
 211
 212 #ifdef _LIBC
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 #else /* size_t might promote */
 219 const char *
 220 re_compile_pattern (const char *pattern, size_t length,
 221                     struct re_pattern_buffer *bufp)
 222 #endif
 223 {
 224   reg_errcode_t ret;
 225
 226   /* And GNU code determines whether or not to get register information
 227      by passing null for the REGS argument to re_match, etc., not by
 228      setting no_sub, unless RE_NO_SUB is set.  */
 229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 230
 231   /* Match anchors at newline.  */
 232   bufp->newline_anchor = 1;
 233
 234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 235
 236   if (!ret)
 237     return NULL;
 238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 239 }
 240 #ifdef _LIBC
 241 weak_alias (__re_compile_pattern, re_compile_pattern)
 242 #endif
 243
 244 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 245    also be assigned to arbitrarily: each pattern buffer stores its own
 246    syntax, so it can be changed between regex compilations.  */
 247 /* This has no initializer because initialized variables in Emacs
 248    become read-only after dumping.  */
 249 reg_syntax_t re_syntax_options;
 250
 251
 252 /* Specify the precise syntax of regexps for compilation.  This provides
 253    for compatibility for various utilities which historically have
 254    different, incompatible syntaxes.
 255
 256    The argument SYNTAX is a bit mask comprised of the various bits
 257    defined in regex.h.  We return the old syntax.  */
 258
 259 reg_syntax_t
 260 re_set_syntax (syntax)
 261     reg_syntax_t syntax;
 262 {
 263   reg_syntax_t ret = re_syntax_options;
 264
 265   re_syntax_options = syntax;
 266   return ret;
 267 }
 268 #ifdef _LIBC
 269 weak_alias (__re_set_syntax, re_set_syntax)
 270 #endif
 271
 272 int
 273 re_compile_fastmap (bufp)
 274     struct re_pattern_buffer *bufp;
 275 {
 276   re_dfa_t *dfa = bufp->buffer;
 277   char *fastmap = bufp->fastmap;
 278
 279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 281   if (dfa->init_state != dfa->init_state_word)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 283   if (dfa->init_state != dfa->init_state_nl)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 285   if (dfa->init_state != dfa->init_state_begbuf)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 287   bufp->fastmap_accurate = 1;
 288   return 0;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 292 #endif
 293
 294 static inline void
 295 __attribute__ ((always_inline))
 296 re_set_fastmap (char *fastmap, bool icase, int ch)
 297 {
 298   fastmap[ch] = 1;
 299   if (icase)
 300     fastmap[tolower (ch)] = 1;
 301 }
 302
 303 /* Helper function for re_compile_fastmap.
 304    Compile fastmap for the initial_state INIT_STATE.  */
 305
 306 static void
 307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 308                          char *fastmap)
 309 {
 310   re_dfa_t *dfa = bufp->buffer;
 311   Idx node_cnt;
 312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 314     {
 315       Idx node = init_state->nodes.elems[node_cnt];
 316       re_token_type_t type = dfa->nodes[node].type;
 317
 318       if (type == CHARACTER)
 319         {
 320           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 321 #ifdef RE_ENABLE_I18N
 322           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 323             {
 324               unsigned char buf[MB_LEN_MAX];
 325               unsigned char *p;
 326               wchar_t wc;
 327               mbstate_t state;
 328
 329               p = buf;
 330               *p++ = dfa->nodes[node].opr.c;
 331               while (++node < dfa->nodes_len
 332                      && dfa->nodes[node].type == CHARACTER
 333                      && dfa->nodes[node].mb_partial)
 334                 *p++ = dfa->nodes[node].opr.c;
 335               memset (&state, '\0', sizeof (state));
 336               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 337                              &state) == p - buf
 338                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 339                       != (size_t) -1))
 340                 re_set_fastmap (fastmap, false, buf[0]);
 341             }
 342 #endif
 343         }
 344       else if (type == SIMPLE_BRACKET)
 345         {
 346           int i, ch;
 347           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 348             {
 349               int j;
 350               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 351               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 352                 if (w & ((bitset_word_t) 1 << j))
 353                   re_set_fastmap (fastmap, icase, ch);
 354             }
 355         }
 356 #ifdef RE_ENABLE_I18N
 357       else if (type == COMPLEX_BRACKET)
 358         {
 359           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 360           Idx i;
 361
 362 # ifdef _LIBC
 363           /* See if we have to try all bytes which start multiple collation
 364              elements.
 365              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 366                   collation element, and don't catch 'b' since 'b' is
 367                   the only collation element which starts from 'b' (and
 368                   it is caught by SIMPLE_BRACKET).  */
 369               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 370                   && (cset->ncoll_syms || cset->nranges))
 371                 {
 372                   const int32_t *table = (const int32_t *)
 373                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 374                   for (i = 0; i < SBC_MAX; ++i)
 375                     if (table[i] < 0)
 376                       re_set_fastmap (fastmap, icase, i);
 377                 }
 378 # endif /* _LIBC */
 379
 380           /* See if we have to start the match at all multibyte characters,
 381              i.e. where we would not find an invalid sequence.  This only
 382              applies to multibyte character sets; for single byte character
 383              sets, the SIMPLE_BRACKET again suffices.  */
 384           if (dfa->mb_cur_max > 1
 385               && (cset->nchar_classes || cset->non_match || cset->nranges
 386 # ifdef _LIBC
 387                   || cset->nequiv_classes
 388 # endif /* _LIBC */
 389                  ))
 390             {
 391               unsigned char c = 0;
 392               do
 393                 {
 394                   mbstate_t mbs;
 395                   memset (&mbs, 0, sizeof (mbs));
 396                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 397                     re_set_fastmap (fastmap, false, (int) c);
 398                 }
 399               while (++c != 0);
 400             }
 401
 402           else
 403             {
 404               /* ... Else catch all bytes which can start the mbchars.  */
 405               for (i = 0; i < cset->nmbchars; ++i)
 406                 {
 407                   char buf[256];
 408                   mbstate_t state;
 409                   memset (&state, '\0', sizeof (state));
 410                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 411                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 412                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 413                     {
 414                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 415                           != (size_t) -1)
 416                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 417                     }
 418                 }
 419             }
 420         }
 421 #endif /* RE_ENABLE_I18N */
 422       else if (type == OP_PERIOD
 423 #ifdef RE_ENABLE_I18N
 424                || type == OP_UTF8_PERIOD
 425 #endif /* RE_ENABLE_I18N */
 426                || type == END_OF_RE)
 427         {
 428           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 429           if (type == END_OF_RE)
 430             bufp->can_be_null = 1;
 431           return;
 432         }
 433     }
 434 }
 435 \f
 436 /* Entry point for POSIX code.  */
 437 /* regcomp takes a regular expression as a string and compiles it.
 438
 439    PREG is a regex_t *.  We do not expect any fields to be initialized,
 440    since POSIX says we shouldn't.  Thus, we set
 441
 442      'buffer' to the compiled pattern;
 443      'used' to the length of the compiled pattern;
 444      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 446        RE_SYNTAX_POSIX_BASIC;
 447      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 448      'fastmap' to an allocated space for the fastmap;
 449      'fastmap_accurate' to zero;
 450      're_nsub' to the number of subexpressions in PATTERN.
 451
 452    PATTERN is the address of the pattern string.
 453
 454    CFLAGS is a series of bits which affect compilation.
 455
 456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 457      use POSIX basic syntax.
 458
 459      If REG_NEWLINE is set, then . and [^...] don't match newline.
 460      Also, regexec will try a match beginning after every newline.
 461
 462      If REG_ICASE is set, then we considers upper- and lowercase
 463      versions of letters to be equivalent when matching.
 464
 465      If REG_NOSUB is set, then when PREG is passed to regexec, that
 466      routine will report only success or failure, and nothing about the
 467      registers.
 468
 469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 470    the return codes and their meanings.)  */
 471
 472 int
 473 regcomp (preg, pattern, cflags)
 474     regex_t *_Restrict_ preg;
 475     const char *_Restrict_ pattern;
 476     int cflags;
 477 {
 478   reg_errcode_t ret;
 479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 480                          : RE_SYNTAX_POSIX_BASIC);
 481
 482   preg->buffer = NULL;
 483   preg->allocated = 0;
 484   preg->used = 0;
 485
 486   /* Try to allocate space for the fastmap.  */
 487   preg->fastmap = re_malloc (char, SBC_MAX);
 488   if (BE (preg->fastmap == NULL, 0))
 489     return REG_ESPACE;
 490
 491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 492
 493   /* If REG_NEWLINE is set, newlines are treated differently.  */
 494   if (cflags & REG_NEWLINE)
 495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 496       syntax &= ~RE_DOT_NEWLINE;
 497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 498       /* It also changes the matching behavior.  */
 499       preg->newline_anchor = 1;
 500     }
 501   else
 502     preg->newline_anchor = 0;
 503   preg->no_sub = !!(cflags & REG_NOSUB);
 504   preg->translate = NULL;
 505
 506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 507
 508   /* POSIX doesn't distinguish between an unmatched open-group and an
 509      unmatched close-group: both are REG_EPAREN.  */
 510   if (ret == REG_ERPAREN)
 511     ret = REG_EPAREN;
 512
 513   /* We have already checked preg->fastmap != NULL.  */
 514   if (BE (ret == REG_NOERROR, 1))
 515     /* Compute the fastmap now, since regexec cannot modify the pattern
 516        buffer.  This function never fails in this implementation.  */
 517     (void) re_compile_fastmap (preg);
 518   else
 519     {
 520       /* Some error occurred while compiling the expression.  */
 521       re_free (preg->fastmap);
 522       preg->fastmap = NULL;
 523     }
 524
 525   return (int) ret;
 526 }
 527 #ifdef _LIBC
 528 weak_alias (__regcomp, regcomp)
 529 #endif
 530
 531 /* Returns a message corresponding to an error code, ERRCODE, returned
 532    from either regcomp or regexec.   We don't use PREG here.  */
 533
 534 #ifdef _LIBC
 535 size_t
 536 regerror (errcode, preg, errbuf, errbuf_size)
 537     int errcode;
 538     const regex_t *_Restrict_ preg;
 539     char *_Restrict_ errbuf;
 540     size_t errbuf_size;
 541 #else /* size_t might promote */
 542 size_t
 543 regerror (int errcode, const regex_t *_Restrict_ preg,
 544           char *_Restrict_ errbuf, size_t errbuf_size)
 545 #endif
 546 {
 547   const char *msg;
 548   size_t msg_size;
 549
 550   if (BE (errcode < 0
 551           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 552                                / sizeof (__re_error_msgid_idx[0])), 0))
 553     /* Only error codes returned by the rest of the code should be passed
 554        to this routine.  If we are given anything else, or if other regex
 555        code generates an invalid error code, then the program has a bug.
 556        Dump core so we can fix it.  */
 557     abort ();
 558
 559   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 560
 561   msg_size = strlen (msg) + 1; /* Includes the null.  */
 562
 563   if (BE (errbuf_size != 0, 1))
 564     {
 565       size_t cpy_size = msg_size;
 566       if (BE (msg_size > errbuf_size, 0))
 567         {
 568           cpy_size = errbuf_size - 1;
 569           errbuf[cpy_size] = '\0';
 570         }
 571       memcpy (errbuf, msg, cpy_size);
 572     }
 573
 574   return msg_size;
 575 }
 576 #ifdef _LIBC
 577 weak_alias (__regerror, regerror)
 578 #endif
 579
 580
 581 #ifdef RE_ENABLE_I18N
 582 /* This static array is used for the map to single-byte characters when
 583    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 584    it the same all the time.  UTF-8 is the preferred encoding so this is
 585    a worthwhile optimization.  */
 586 static const bitset_t utf8_sb_map =
 587 {
 588   /* Set the first 128 bits.  */
 589 # if defined __GNUC__ && !defined __STRICT_ANSI__
 590   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 591 # else
 592 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 593 #   error "bitset_word_t is narrower than 32 bits"
 594 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 595   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 596 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX, BITSET_WORD_MAX,
 598 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 599   BITSET_WORD_MAX,
 600 #  endif
 601   (BITSET_WORD_MAX
 602    >> (SBC_MAX % BITSET_WORD_BITS == 0
 603        ? 0
 604        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 605 # endif
 606 };
 607 #endif
 608
 609
 610 static void
 611 free_dfa_content (re_dfa_t *dfa)
 612 {
 613   Idx i, j;
 614
 615   if (dfa->nodes)
 616     for (i = 0; i < dfa->nodes_len; ++i)
 617       free_token (dfa->nodes + i);
 618   re_free (dfa->nexts);
 619   for (i = 0; i < dfa->nodes_len; ++i)
 620     {
 621       if (dfa->eclosures != NULL)
 622         re_node_set_free (dfa->eclosures + i);
 623       if (dfa->inveclosures != NULL)
 624         re_node_set_free (dfa->inveclosures + i);
 625       if (dfa->edests != NULL)
 626         re_node_set_free (dfa->edests + i);
 627     }
 628   re_free (dfa->edests);
 629   re_free (dfa->eclosures);
 630   re_free (dfa->inveclosures);
 631   re_free (dfa->nodes);
 632
 633   if (dfa->state_table)
 634     for (i = 0; i <= dfa->state_hash_mask; ++i)
 635       {
 636         struct re_state_table_entry *entry = dfa->state_table + i;
 637         for (j = 0; j < entry->num; ++j)
 638           {
 639             re_dfastate_t *state = entry->array[j];
 640             free_state (state);
 641           }
 642         re_free (entry->array);
 643       }
 644   re_free (dfa->state_table);
 645 #ifdef RE_ENABLE_I18N
 646   if (dfa->sb_char != utf8_sb_map)
 647     re_free (dfa->sb_char);
 648 #endif
 649   re_free (dfa->subexp_map);
 650 #ifdef DEBUG
 651   re_free (dfa->re_str);
 652 #endif
 653
 654   re_free (dfa);
 655 }
 656
 657
 658 /* Free dynamically allocated space used by PREG.  */
 659
 660 void
 661 regfree (preg)
 662     regex_t *preg;
 663 {
 664   re_dfa_t *dfa = preg->buffer;
 665   if (BE (dfa != NULL, 1))
 666     {
 667       lock_fini (dfa->lock);
 668       free_dfa_content (dfa);
 669     }
 670   preg->buffer = NULL;
 671   preg->allocated = 0;
 672
 673   re_free (preg->fastmap);
 674   preg->fastmap = NULL;
 675
 676   re_free (preg->translate);
 677   preg->translate = NULL;
 678 }
 679 #ifdef _LIBC
 680 weak_alias (__regfree, regfree)
 681 #endif
 682 \f
 683 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 684    them unless specifically requested.  */
 685
 686 #if defined _REGEX_RE_COMP || defined _LIBC
 687
 688 /* BSD has one and only one pattern buffer.  */
 689 static struct re_pattern_buffer re_comp_buf;
 690
 691 char *
 692 # ifdef _LIBC
 693 /* Make these definitions weak in libc, so POSIX programs can redefine
 694    these names if they don't use our functions, and still use
 695    regcomp/regexec above without link errors.  */
 696 weak_function
 697 # endif
 698 re_comp (s)
 699      const char *s;
 700 {
 701   reg_errcode_t ret;
 702   char *fastmap;
 703
 704   if (!s)
 705     {
 706       if (!re_comp_buf.buffer)
 707         return gettext ("No previous regular expression");
 708       return 0;
 709     }
 710
 711   if (re_comp_buf.buffer)
 712     {
 713       fastmap = re_comp_buf.fastmap;
 714       re_comp_buf.fastmap = NULL;
 715       __regfree (&re_comp_buf);
 716       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 717       re_comp_buf.fastmap = fastmap;
 718     }
 719
 720   if (re_comp_buf.fastmap == NULL)
 721     {
 722       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 723       if (re_comp_buf.fastmap == NULL)
 724         return (char *) gettext (__re_error_msgid
 725                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 726     }
 727
 728   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 729      don't need to initialize the pattern buffer fields which affect it.  */
 730
 731   /* Match anchors at newlines.  */
 732   re_comp_buf.newline_anchor = 1;
 733
 734   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 735
 736   if (!ret)
 737     return NULL;
 738
 739   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 740   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 741 }
 742
 743 #ifdef _LIBC
 744 libc_freeres_fn (free_mem)
 745 {
 746   __regfree (&re_comp_buf);
 747 }
 748 #endif
 749
 750 #endif /* _REGEX_RE_COMP */
 751 \f
 752 /* Internal entry point.
 753    Compile the regular expression PATTERN, whose length is LENGTH.
 754    SYNTAX indicate regular expression's syntax.  */
 755
 756 static reg_errcode_t
 757 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 758                      reg_syntax_t syntax)
 759 {
 760   reg_errcode_t err = REG_NOERROR;
 761   re_dfa_t *dfa;
 762   re_string_t regexp;
 763
 764   /* Initialize the pattern buffer.  */
 765   preg->fastmap_accurate = 0;
 766   preg->syntax = syntax;
 767   preg->not_bol = preg->not_eol = 0;
 768   preg->used = 0;
 769   preg->re_nsub = 0;
 770   preg->can_be_null = 0;
 771   preg->regs_allocated = REGS_UNALLOCATED;
 772
 773   /* Initialize the dfa.  */
 774   dfa = preg->buffer;
 775   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 776     {
 777       /* If zero allocated, but buffer is non-null, try to realloc
 778          enough space.  This loses if buffer's address is bogus, but
 779          that is the user's responsibility.  If ->buffer is NULL this
 780          is a simple allocation.  */
 781       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 782       if (dfa == NULL)
 783         return REG_ESPACE;
 784       preg->allocated = sizeof (re_dfa_t);
 785       preg->buffer = dfa;
 786     }
 787   preg->used = sizeof (re_dfa_t);
 788
 789   err = init_dfa (dfa, length);
 790   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
 791     err = REG_ESPACE;
 792   if (BE (err != REG_NOERROR, 0))
 793     {
 794       free_dfa_content (dfa);
 795       preg->buffer = NULL;
 796       preg->allocated = 0;
 797       return err;
 798     }
 799 #ifdef DEBUG
 800   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 801   dfa->re_str = re_malloc (char, length + 1);
 802   strncpy (dfa->re_str, pattern, length + 1);
 803 #endif
 804
 805   err = re_string_construct (&regexp, pattern, length, preg->translate,
 806                              (syntax & RE_ICASE) != 0, dfa);
 807   if (BE (err != REG_NOERROR, 0))
 808     {
 809     re_compile_internal_free_return:
 810       free_workarea_compile (preg);
 811       re_string_destruct (&regexp);
 812       lock_fini (dfa->lock);
 813       free_dfa_content (dfa);
 814       preg->buffer = NULL;
 815       preg->allocated = 0;
 816       return err;
 817     }
 818
 819   /* Parse the regular expression, and build a structure tree.  */
 820   preg->re_nsub = 0;
 821   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 822   if (BE (dfa->str_tree == NULL, 0))
 823     goto re_compile_internal_free_return;
 824
 825   /* Analyze the tree and create the nfa.  */
 826   err = analyze (preg);
 827   if (BE (err != REG_NOERROR, 0))
 828     goto re_compile_internal_free_return;
 829
 830 #ifdef RE_ENABLE_I18N
 831   /* If possible, do searching in single byte encoding to speed things up.  */
 832   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 833     optimize_utf8 (dfa);
 834 #endif
 835
 836   /* Then create the initial state of the dfa.  */
 837   err = create_initial_state (dfa);
 838
 839   /* Release work areas.  */
 840   free_workarea_compile (preg);
 841   re_string_destruct (&regexp);
 842
 843   if (BE (err != REG_NOERROR, 0))
 844     {
 845       lock_fini (dfa->lock);
 846       free_dfa_content (dfa);
 847       preg->buffer = NULL;
 848       preg->allocated = 0;
 849     }
 850
 851   return err;
 852 }
 853
 854 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 855    as the initial length of some arrays.  */
 856
 857 static reg_errcode_t
 858 init_dfa (re_dfa_t *dfa, size_t pat_len)
 859 {
 860   __re_size_t table_size;
 861 #ifndef _LIBC
 862   const char *codeset_name;
 863 #endif
 864 #ifdef RE_ENABLE_I18N
 865   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 866 #else
 867   size_t max_i18n_object_size = 0;
 868 #endif
 869   size_t max_object_size =
 870     MAX (sizeof (struct re_state_table_entry),
 871          MAX (sizeof (re_token_t),
 872               MAX (sizeof (re_node_set),
 873                    MAX (sizeof (regmatch_t),
 874                         max_i18n_object_size))));
 875
 876   memset (dfa, '\0', sizeof (re_dfa_t));
 877
 878   /* Force allocation of str_tree_storage the first time.  */
 879   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 880
 881   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 882      calculation below, and for similar doubling calculations
 883      elsewhere.  And it's <= rather than <, because some of the
 884      doubling calculations add 1 afterwards.  */
 885   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 886     return REG_ESPACE;
 887
 888   dfa->nodes_alloc = pat_len + 1;
 889   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 890
 891   /*  table_size = 2 ^ ceil(log pat_len) */
 892   for (table_size = 1; ; table_size <<= 1)
 893     if (table_size > pat_len)
 894       break;
 895
 896   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 897   dfa->state_hash_mask = table_size - 1;
 898
 899   dfa->mb_cur_max = MB_CUR_MAX;
 900 #ifdef _LIBC
 901   if (dfa->mb_cur_max == 6
 902       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 903     dfa->is_utf8 = 1;
 904   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 905                        != 0);
 906 #else
 907   codeset_name = nl_langinfo (CODESET);
 908   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 909       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 910       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 911       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 912     dfa->is_utf8 = 1;
 913
 914   /* We check exhaustively in the loop below if this charset is a
 915      superset of ASCII.  */
 916   dfa->map_notascii = 0;
 917 #endif
 918
 919 #ifdef RE_ENABLE_I18N
 920   if (dfa->mb_cur_max > 1)
 921     {
 922       if (dfa->is_utf8)
 923         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 924       else
 925         {
 926           int i, j, ch;
 927
 928           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 929           if (BE (dfa->sb_char == NULL, 0))
 930             return REG_ESPACE;
 931
 932           /* Set the bits corresponding to single byte chars.  */
 933           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 934             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 935               {
 936                 wint_t wch = __btowc (ch);
 937                 if (wch != WEOF)
 938                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 939 # ifndef _LIBC
 940                 if (isascii (ch) && wch != ch)
 941                   dfa->map_notascii = 1;
 942 # endif
 943               }
 944         }
 945     }
 946 #endif
 947
 948   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 949     return REG_ESPACE;
 950   return REG_NOERROR;
 951 }
 952
 953 /* Initialize WORD_CHAR table, which indicate which character is
 954    "word".  In this case "word" means that it is the word construction
 955    character used by some operators like "\<", "\>", etc.  */
 956
 957 static void
 958 internal_function
 959 init_word_char (re_dfa_t *dfa)
 960 {
 961   int i = 0;
 962   int j;
 963   int ch = 0;
 964   dfa->word_ops_used = 1;
 965   if (BE (dfa->map_notascii == 0, 1))
 966     {
 967       bitset_word_t bits0 = 0x00000000;
 968       bitset_word_t bits1 = 0x03ff0000;
 969       bitset_word_t bits2 = 0x87fffffe;
 970       bitset_word_t bits3 = 0x07fffffe;
 971       if (BITSET_WORD_BITS == 64)
 972         {
 973           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 974           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 975           i = 2;
 976         }
 977       else if (BITSET_WORD_BITS == 32)
 978         {
 979           dfa->word_char[0] = bits0;
 980           dfa->word_char[1] = bits1;
 981           dfa->word_char[2] = bits2;
 982           dfa->word_char[3] = bits3;
 983           i = 4;
 984         }
 985       else
 986         goto general_case;
 987       ch = 128;
 988
 989       if (BE (dfa->is_utf8, 1))
 990         {
 991           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 992           return;
 993         }
 994     }
 995
 996  general_case:
 997   for (; i < BITSET_WORDS; ++i)
 998     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 999       if (isalnum (ch) || ch == '_')
1000         dfa->word_char[i] |= (bitset_word_t) 1 << j;
1001 }
1002
1003 /* Free the work area which are only used while compiling.  */
1004
1005 static void
1006 free_workarea_compile (regex_t *preg)
1007 {
1008   re_dfa_t *dfa = preg->buffer;
1009   bin_tree_storage_t *storage, *next;
1010   for (storage = dfa->str_tree_storage; storage; storage = next)
1011     {
1012       next = storage->next;
1013       re_free (storage);
1014     }
1015   dfa->str_tree_storage = NULL;
1016   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1017   dfa->str_tree = NULL;
1018   re_free (dfa->org_indices);
1019   dfa->org_indices = NULL;
1020 }
1021
1022 /* Create initial states for all contexts.  */
1023
1024 static reg_errcode_t
1025 create_initial_state (re_dfa_t *dfa)
1026 {
1027   Idx first, i;
1028   reg_errcode_t err;
1029   re_node_set init_nodes;
1030
1031   /* Initial states have the epsilon closure of the node which is
1032      the first node of the regular expression.  */
1033   first = dfa->str_tree->first->node_idx;
1034   dfa->init_node = first;
1035   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1036   if (BE (err != REG_NOERROR, 0))
1037     return err;
1038
1039   /* The back-references which are in initial states can epsilon transit,
1040      since in this case all of the subexpressions can be null.
1041      Then we add epsilon closures of the nodes which are the next nodes of
1042      the back-references.  */
1043   if (dfa->nbackref > 0)
1044     for (i = 0; i < init_nodes.nelem; ++i)
1045       {
1046         Idx node_idx = init_nodes.elems[i];
1047         re_token_type_t type = dfa->nodes[node_idx].type;
1048
1049         Idx clexp_idx;
1050         if (type != OP_BACK_REF)
1051           continue;
1052         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1053           {
1054             re_token_t *clexp_node;
1055             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1056             if (clexp_node->type == OP_CLOSE_SUBEXP
1057                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1058               break;
1059           }
1060         if (clexp_idx == init_nodes.nelem)
1061           continue;
1062
1063         if (type == OP_BACK_REF)
1064           {
1065             Idx dest_idx = dfa->edests[node_idx].elems[0];
1066             if (!re_node_set_contains (&init_nodes, dest_idx))
1067               {
1068                 reg_errcode_t merge_err
1069                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1070                 if (merge_err != REG_NOERROR)
1071                   return merge_err;
1072                 i = 0;
1073               }
1074           }
1075       }
1076
1077   /* It must be the first time to invoke acquire_state.  */
1078   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1079   /* We don't check ERR here, since the initial state must not be NULL.  */
1080   if (BE (dfa->init_state == NULL, 0))
1081     return err;
1082   if (dfa->init_state->has_constraint)
1083     {
1084       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1085                                                        CONTEXT_WORD);
1086       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1087                                                      CONTEXT_NEWLINE);
1088       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1089                                                          &init_nodes,
1090                                                          CONTEXT_NEWLINE
1091                                                          | CONTEXT_BEGBUF);
1092       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1093               || dfa->init_state_begbuf == NULL, 0))
1094         return err;
1095     }
1096   else
1097     dfa->init_state_word = dfa->init_state_nl
1098       = dfa->init_state_begbuf = dfa->init_state;
1099
1100   re_node_set_free (&init_nodes);
1101   return REG_NOERROR;
1102 }
1103 \f
1104 #ifdef RE_ENABLE_I18N
1105 /* If it is possible to do searching in single byte encoding instead of UTF-8
1106    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1107    DFA nodes where needed.  */
1108
1109 static void
1110 optimize_utf8 (re_dfa_t *dfa)
1111 {
1112   Idx node;
1113   int i;
1114   bool mb_chars = false;
1115   bool has_period = false;
1116
1117   for (node = 0; node < dfa->nodes_len; ++node)
1118     switch (dfa->nodes[node].type)
1119       {
1120       case CHARACTER:
1121         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1122           mb_chars = true;
1123         break;
1124       case ANCHOR:
1125         switch (dfa->nodes[node].opr.ctx_type)
1126           {
1127           case LINE_FIRST:
1128           case LINE_LAST:
1129           case BUF_FIRST:
1130           case BUF_LAST:
1131             break;
1132           default:
1133             /* Word anchors etc. cannot be handled.  It's okay to test
1134                opr.ctx_type since constraints (for all DFA nodes) are
1135                created by ORing one or more opr.ctx_type values.  */
1136             return;
1137           }
1138         break;
1139       case OP_PERIOD:
1140         has_period = true;
1141         break;
1142       case OP_BACK_REF:
1143       case OP_ALT:
1144       case END_OF_RE:
1145       case OP_DUP_ASTERISK:
1146       case OP_OPEN_SUBEXP:
1147       case OP_CLOSE_SUBEXP:
1148         break;
1149       case COMPLEX_BRACKET:
1150         return;
1151       case SIMPLE_BRACKET:
1152         /* Just double check.  */
1153         {
1154           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1155                         ? 0
1156                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1157           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1158             {
1159               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1160                 return;
1161               rshift = 0;
1162             }
1163         }
1164         break;
1165       default:
1166         abort ();
1167       }
1168
1169   if (mb_chars || has_period)
1170     for (node = 0; node < dfa->nodes_len; ++node)
1171       {
1172         if (dfa->nodes[node].type == CHARACTER
1173             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1174           dfa->nodes[node].mb_partial = 0;
1175         else if (dfa->nodes[node].type == OP_PERIOD)
1176           dfa->nodes[node].type = OP_UTF8_PERIOD;
1177       }
1178
1179   /* The search can be in single byte locale.  */
1180   dfa->mb_cur_max = 1;
1181   dfa->is_utf8 = 0;
1182   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1183 }
1184 #endif
1185 \f
1186 /* Analyze the structure tree, and calculate "first", "next", "edest",
1187    "eclosure", and "inveclosure".  */
1188
1189 static reg_errcode_t
1190 analyze (regex_t *preg)
1191 {
1192   re_dfa_t *dfa = preg->buffer;
1193   reg_errcode_t ret;
1194
1195   /* Allocate arrays.  */
1196   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1197   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1198   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1199   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1200   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1201           || dfa->eclosures == NULL, 0))
1202     return REG_ESPACE;
1203
1204   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1205   if (dfa->subexp_map != NULL)
1206     {
1207       Idx i;
1208       for (i = 0; i < preg->re_nsub; i++)
1209         dfa->subexp_map[i] = i;
1210       preorder (dfa->str_tree, optimize_subexps, dfa);
1211       for (i = 0; i < preg->re_nsub; i++)
1212         if (dfa->subexp_map[i] != i)
1213           break;
1214       if (i == preg->re_nsub)
1215         {
1216           free (dfa->subexp_map);
1217           dfa->subexp_map = NULL;
1218         }
1219     }
1220
1221   ret = postorder (dfa->str_tree, lower_subexps, preg);
1222   if (BE (ret != REG_NOERROR, 0))
1223     return ret;
1224   ret = postorder (dfa->str_tree, calc_first, dfa);
1225   if (BE (ret != REG_NOERROR, 0))
1226     return ret;
1227   preorder (dfa->str_tree, calc_next, dfa);
1228   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1229   if (BE (ret != REG_NOERROR, 0))
1230     return ret;
1231   ret = calc_eclosure (dfa);
1232   if (BE (ret != REG_NOERROR, 0))
1233     return ret;
1234
1235   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1236      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1237   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1238       || dfa->nbackref)
1239     {
1240       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1241       if (BE (dfa->inveclosures == NULL, 0))
1242         return REG_ESPACE;
1243       ret = calc_inveclosure (dfa);
1244     }
1245
1246   return ret;
1247 }
1248
1249 /* Our parse trees are very unbalanced, so we cannot use a stack to
1250    implement parse tree visits.  Instead, we use parent pointers and
1251    some hairy code in these two functions.  */
1252 static reg_errcode_t
1253 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1254            void *extra)
1255 {
1256   bin_tree_t *node, *prev;
1257
1258   for (node = root; ; )
1259     {
1260       /* Descend down the tree, preferably to the left (or to the right
1261          if that's the only child).  */
1262       while (node->left || node->right)
1263         if (node->left)
1264           node = node->left;
1265         else
1266           node = node->right;
1267
1268       do
1269         {
1270           reg_errcode_t err = fn (extra, node);
1271           if (BE (err != REG_NOERROR, 0))
1272             return err;
1273           if (node->parent == NULL)
1274             return REG_NOERROR;
1275           prev = node;
1276           node = node->parent;
1277         }
1278       /* Go up while we have a node that is reached from the right.  */
1279       while (node->right == prev || node->right == NULL);
1280       node = node->right;
1281     }
1282 }
1283
1284 static reg_errcode_t
1285 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1286           void *extra)
1287 {
1288   bin_tree_t *node;
1289
1290   for (node = root; ; )
1291     {
1292       reg_errcode_t err = fn (extra, node);
1293       if (BE (err != REG_NOERROR, 0))
1294         return err;
1295
1296       /* Go to the left node, or up and to the right.  */
1297       if (node->left)
1298         node = node->left;
1299       else
1300         {
1301           bin_tree_t *prev = NULL;
1302           while (node->right == prev || node->right == NULL)
1303             {
1304               prev = node;
1305               node = node->parent;
1306               if (!node)
1307                 return REG_NOERROR;
1308             }
1309           node = node->right;
1310         }
1311     }
1312 }
1313
1314 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1315    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1316    backreferences as well.  Requires a preorder visit.  */
1317 static reg_errcode_t
1318 optimize_subexps (void *extra, bin_tree_t *node)
1319 {
1320   re_dfa_t *dfa = (re_dfa_t *) extra;
1321
1322   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1323     {
1324       int idx = node->token.opr.idx;
1325       node->token.opr.idx = dfa->subexp_map[idx];
1326       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1327     }
1328
1329   else if (node->token.type == SUBEXP
1330            && node->left && node->left->token.type == SUBEXP)
1331     {
1332       Idx other_idx = node->left->token.opr.idx;
1333
1334       node->left = node->left->left;
1335       if (node->left)
1336         node->left->parent = node;
1337
1338       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1339       if (other_idx < BITSET_WORD_BITS)
1340         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1341     }
1342
1343   return REG_NOERROR;
1344 }
1345
1346 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1347    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1348 static reg_errcode_t
1349 lower_subexps (void *extra, bin_tree_t *node)
1350 {
1351   regex_t *preg = (regex_t *) extra;
1352   reg_errcode_t err = REG_NOERROR;
1353
1354   if (node->left && node->left->token.type == SUBEXP)
1355     {
1356       node->left = lower_subexp (&err, preg, node->left);
1357       if (node->left)
1358         node->left->parent = node;
1359     }
1360   if (node->right && node->right->token.type == SUBEXP)
1361     {
1362       node->right = lower_subexp (&err, preg, node->right);
1363       if (node->right)
1364         node->right->parent = node;
1365     }
1366
1367   return err;
1368 }
1369
1370 static bin_tree_t *
1371 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1372 {
1373   re_dfa_t *dfa = preg->buffer;
1374   bin_tree_t *body = node->left;
1375   bin_tree_t *op, *cls, *tree1, *tree;
1376
1377   if (preg->no_sub
1378       /* We do not optimize empty subexpressions, because otherwise we may
1379          have bad CONCAT nodes with NULL children.  This is obviously not
1380          very common, so we do not lose much.  An example that triggers
1381          this case is the sed "script" /\(\)/x.  */
1382       && node->left != NULL
1383       && (node->token.opr.idx >= BITSET_WORD_BITS
1384           || !(dfa->used_bkref_map
1385                & ((bitset_word_t) 1 << node->token.opr.idx))))
1386     return node->left;
1387
1388   /* Convert the SUBEXP node to the concatenation of an
1389      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1390   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1391   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1392   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1393   tree = create_tree (dfa, op, tree1, CONCAT);
1394   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1395     {
1396       *err = REG_ESPACE;
1397       return NULL;
1398     }
1399
1400   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1401   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1402   return tree;
1403 }
1404
1405 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1406    nodes.  Requires a postorder visit.  */
1407 static reg_errcode_t
1408 calc_first (void *extra, bin_tree_t *node)
1409 {
1410   re_dfa_t *dfa = (re_dfa_t *) extra;
1411   if (node->token.type == CONCAT)
1412     {
1413       node->first = node->left->first;
1414       node->node_idx = node->left->node_idx;
1415     }
1416   else
1417     {
1418       node->first = node;
1419       node->node_idx = re_dfa_add_node (dfa, node->token);
1420       if (BE (node->node_idx == REG_MISSING, 0))
1421         return REG_ESPACE;
1422       if (node->token.type == ANCHOR)
1423         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1424     }
1425   return REG_NOERROR;
1426 }
1427
1428 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1429 static reg_errcode_t
1430 calc_next (void *extra, bin_tree_t *node)
1431 {
1432   switch (node->token.type)
1433     {
1434     case OP_DUP_ASTERISK:
1435       node->left->next = node;
1436       break;
1437     case CONCAT:
1438       node->left->next = node->right->first;
1439       node->right->next = node->next;
1440       break;
1441     default:
1442       if (node->left)
1443         node->left->next = node->next;
1444       if (node->right)
1445         node->right->next = node->next;
1446       break;
1447     }
1448   return REG_NOERROR;
1449 }
1450
1451 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1452 static reg_errcode_t
1453 link_nfa_nodes (void *extra, bin_tree_t *node)
1454 {
1455   re_dfa_t *dfa = (re_dfa_t *) extra;
1456   Idx idx = node->node_idx;
1457   reg_errcode_t err = REG_NOERROR;
1458
1459   switch (node->token.type)
1460     {
1461     case CONCAT:
1462       break;
1463
1464     case END_OF_RE:
1465       assert (node->next == NULL);
1466       break;
1467
1468     case OP_DUP_ASTERISK:
1469     case OP_ALT:
1470       {
1471         Idx left, right;
1472         dfa->has_plural_match = 1;
1473         if (node->left != NULL)
1474           left = node->left->first->node_idx;
1475         else
1476           left = node->next->node_idx;
1477         if (node->right != NULL)
1478           right = node->right->first->node_idx;
1479         else
1480           right = node->next->node_idx;
1481         assert (REG_VALID_INDEX (left));
1482         assert (REG_VALID_INDEX (right));
1483         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1484       }
1485       break;
1486
1487     case ANCHOR:
1488     case OP_OPEN_SUBEXP:
1489     case OP_CLOSE_SUBEXP:
1490       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1491       break;
1492
1493     case OP_BACK_REF:
1494       dfa->nexts[idx] = node->next->node_idx;
1495       if (node->token.type == OP_BACK_REF)
1496         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1497       break;
1498
1499     default:
1500       assert (!IS_EPSILON_NODE (node->token.type));
1501       dfa->nexts[idx] = node->next->node_idx;
1502       break;
1503     }
1504
1505   return err;
1506 }
1507
1508 /* Duplicate the epsilon closure of the node ROOT_NODE.
1509    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1510    to their own constraint.  */
1511
1512 static reg_errcode_t
1513 internal_function
1514 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1515                         Idx root_node, unsigned int init_constraint)
1516 {
1517   Idx org_node, clone_node;
1518   bool ok;
1519   unsigned int constraint = init_constraint;
1520   for (org_node = top_org_node, clone_node = top_clone_node;;)
1521     {
1522       Idx org_dest, clone_dest;
1523       if (dfa->nodes[org_node].type == OP_BACK_REF)
1524         {
1525           /* If the back reference epsilon-transit, its destination must
1526              also have the constraint.  Then duplicate the epsilon closure
1527              of the destination of the back reference, and store it in
1528              edests of the back reference.  */
1529           org_dest = dfa->nexts[org_node];
1530           re_node_set_empty (dfa->edests + clone_node);
1531           clone_dest = duplicate_node (dfa, org_dest, constraint);
1532           if (BE (clone_dest == REG_MISSING, 0))
1533             return REG_ESPACE;
1534           dfa->nexts[clone_node] = dfa->nexts[org_node];
1535           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536           if (BE (! ok, 0))
1537             return REG_ESPACE;
1538         }
1539       else if (dfa->edests[org_node].nelem == 0)
1540         {
1541           /* In case of the node can't epsilon-transit, don't duplicate the
1542              destination and store the original destination as the
1543              destination of the node.  */
1544           dfa->nexts[clone_node] = dfa->nexts[org_node];
1545           break;
1546         }
1547       else if (dfa->edests[org_node].nelem == 1)
1548         {
1549           /* In case of the node can epsilon-transit, and it has only one
1550              destination.  */
1551           org_dest = dfa->edests[org_node].elems[0];
1552           re_node_set_empty (dfa->edests + clone_node);
1553           /* If the node is root_node itself, it means the epsilon closure
1554              has a loop.  Then tie it to the destination of the root_node.  */
1555           if (org_node == root_node && clone_node != org_node)
1556             {
1557               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1558               if (BE (! ok, 0))
1559                 return REG_ESPACE;
1560               break;
1561             }
1562           /* In case the node has another constraint, append it.  */
1563           constraint |= dfa->nodes[org_node].constraint;
1564           clone_dest = duplicate_node (dfa, org_dest, constraint);
1565           if (BE (clone_dest == REG_MISSING, 0))
1566             return REG_ESPACE;
1567           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568           if (BE (! ok, 0))
1569             return REG_ESPACE;
1570         }
1571       else /* dfa->edests[org_node].nelem == 2 */
1572         {
1573           /* In case of the node can epsilon-transit, and it has two
1574              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1575           org_dest = dfa->edests[org_node].elems[0];
1576           re_node_set_empty (dfa->edests + clone_node);
1577           /* Search for a duplicated node which satisfies the constraint.  */
1578           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1579           if (clone_dest == REG_MISSING)
1580             {
1581               /* There is no such duplicated node, create a new one.  */
1582               reg_errcode_t err;
1583               clone_dest = duplicate_node (dfa, org_dest, constraint);
1584               if (BE (clone_dest == REG_MISSING, 0))
1585                 return REG_ESPACE;
1586               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1587               if (BE (! ok, 0))
1588                 return REG_ESPACE;
1589               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1590                                             root_node, constraint);
1591               if (BE (err != REG_NOERROR, 0))
1592                 return err;
1593             }
1594           else
1595             {
1596               /* There is a duplicated node which satisfies the constraint,
1597                  use it to avoid infinite loop.  */
1598               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1599               if (BE (! ok, 0))
1600                 return REG_ESPACE;
1601             }
1602
1603           org_dest = dfa->edests[org_node].elems[1];
1604           clone_dest = duplicate_node (dfa, org_dest, constraint);
1605           if (BE (clone_dest == REG_MISSING, 0))
1606             return REG_ESPACE;
1607           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1608           if (BE (! ok, 0))
1609             return REG_ESPACE;
1610         }
1611       org_node = org_dest;
1612       clone_node = clone_dest;
1613     }
1614   return REG_NOERROR;
1615 }
1616
1617 /* Search for a node which is duplicated from the node ORG_NODE, and
1618    satisfies the constraint CONSTRAINT.  */
1619
1620 static Idx
1621 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1622                         unsigned int constraint)
1623 {
1624   Idx idx;
1625   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1626     {
1627       if (org_node == dfa->org_indices[idx]
1628           && constraint == dfa->nodes[idx].constraint)
1629         return idx; /* Found.  */
1630     }
1631   return REG_MISSING; /* Not found.  */
1632 }
1633
1634 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1635    Return the index of the new node, or REG_MISSING if insufficient storage is
1636    available.  */
1637
1638 static Idx
1639 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1640 {
1641   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1642   if (BE (dup_idx != REG_MISSING, 1))
1643     {
1644       dfa->nodes[dup_idx].constraint = constraint;
1645       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1646       dfa->nodes[dup_idx].duplicated = 1;
1647
1648       /* Store the index of the original node.  */
1649       dfa->org_indices[dup_idx] = org_idx;
1650     }
1651   return dup_idx;
1652 }
1653
1654 static reg_errcode_t
1655 calc_inveclosure (re_dfa_t *dfa)
1656 {
1657   Idx src, idx;
1658   bool ok;
1659   for (idx = 0; idx < dfa->nodes_len; ++idx)
1660     re_node_set_init_empty (dfa->inveclosures + idx);
1661
1662   for (src = 0; src < dfa->nodes_len; ++src)
1663     {
1664       Idx *elems = dfa->eclosures[src].elems;
1665       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1666         {
1667           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1668           if (BE (! ok, 0))
1669             return REG_ESPACE;
1670         }
1671     }
1672
1673   return REG_NOERROR;
1674 }
1675
1676 /* Calculate "eclosure" for all the node in DFA.  */
1677
1678 static reg_errcode_t
1679 calc_eclosure (re_dfa_t *dfa)
1680 {
1681   Idx node_idx;
1682   bool incomplete;
1683 #ifdef DEBUG
1684   assert (dfa->nodes_len > 0);
1685 #endif
1686   incomplete = false;
1687   /* For each nodes, calculate epsilon closure.  */
1688   for (node_idx = 0; ; ++node_idx)
1689     {
1690       reg_errcode_t err;
1691       re_node_set eclosure_elem;
1692       if (node_idx == dfa->nodes_len)
1693         {
1694           if (!incomplete)
1695             break;
1696           incomplete = false;
1697           node_idx = 0;
1698         }
1699
1700 #ifdef DEBUG
1701       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1702 #endif
1703
1704       /* If we have already calculated, skip it.  */
1705       if (dfa->eclosures[node_idx].nelem != 0)
1706         continue;
1707       /* Calculate epsilon closure of 'node_idx'.  */
1708       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1709       if (BE (err != REG_NOERROR, 0))
1710         return err;
1711
1712       if (dfa->eclosures[node_idx].nelem == 0)
1713         {
1714           incomplete = true;
1715           re_node_set_free (&eclosure_elem);
1716         }
1717     }
1718   return REG_NOERROR;
1719 }
1720
1721 /* Calculate epsilon closure of NODE.  */
1722
1723 static reg_errcode_t
1724 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1725 {
1726   reg_errcode_t err;
1727   Idx i;
1728   re_node_set eclosure;
1729   bool ok;
1730   bool incomplete = false;
1731   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1732   if (BE (err != REG_NOERROR, 0))
1733     return err;
1734
1735   /* This indicates that we are calculating this node now.
1736      We reference this value to avoid infinite loop.  */
1737   dfa->eclosures[node].nelem = REG_MISSING;
1738
1739   /* If the current node has constraints, duplicate all nodes
1740      since they must inherit the constraints.  */
1741   if (dfa->nodes[node].constraint
1742       && dfa->edests[node].nelem
1743       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1744     {
1745       err = duplicate_node_closure (dfa, node, node, node,
1746                                     dfa->nodes[node].constraint);
1747       if (BE (err != REG_NOERROR, 0))
1748         return err;
1749     }
1750
1751   /* Expand each epsilon destination nodes.  */
1752   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1753     for (i = 0; i < dfa->edests[node].nelem; ++i)
1754       {
1755         re_node_set eclosure_elem;
1756         Idx edest = dfa->edests[node].elems[i];
1757         /* If calculating the epsilon closure of 'edest' is in progress,
1758            return intermediate result.  */
1759         if (dfa->eclosures[edest].nelem == REG_MISSING)
1760           {
1761             incomplete = true;
1762             continue;
1763           }
1764         /* If we haven't calculated the epsilon closure of 'edest' yet,
1765            calculate now. Otherwise use calculated epsilon closure.  */
1766         if (dfa->eclosures[edest].nelem == 0)
1767           {
1768             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1769             if (BE (err != REG_NOERROR, 0))
1770               return err;
1771           }
1772         else
1773           eclosure_elem = dfa->eclosures[edest];
1774         /* Merge the epsilon closure of 'edest'.  */
1775         err = re_node_set_merge (&eclosure, &eclosure_elem);
1776         if (BE (err != REG_NOERROR, 0))
1777           return err;
1778         /* If the epsilon closure of 'edest' is incomplete,
1779            the epsilon closure of this node is also incomplete.  */
1780         if (dfa->eclosures[edest].nelem == 0)
1781           {
1782             incomplete = true;
1783             re_node_set_free (&eclosure_elem);
1784           }
1785       }
1786
1787   /* An epsilon closure includes itself.  */
1788   ok = re_node_set_insert (&eclosure, node);
1789   if (BE (! ok, 0))
1790     return REG_ESPACE;
1791   if (incomplete && !root)
1792     dfa->eclosures[node].nelem = 0;
1793   else
1794     dfa->eclosures[node] = eclosure;
1795   *new_set = eclosure;
1796   return REG_NOERROR;
1797 }
1798 \f
1799 /* Functions for token which are used in the parser.  */
1800
1801 /* Fetch a token from INPUT.
1802    We must not use this function inside bracket expressions.  */
1803
1804 static void
1805 internal_function
1806 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1807 {
1808   re_string_skip_bytes (input, peek_token (result, input, syntax));
1809 }
1810
1811 /* Peek a token from INPUT, and return the length of the token.
1812    We must not use this function inside bracket expressions.  */
1813
1814 static int
1815 internal_function
1816 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1817 {
1818   unsigned char c;
1819
1820   if (re_string_eoi (input))
1821     {
1822       token->type = END_OF_RE;
1823       return 0;
1824     }
1825
1826   c = re_string_peek_byte (input, 0);
1827   token->opr.c = c;
1828
1829   token->word_char = 0;
1830 #ifdef RE_ENABLE_I18N
1831   token->mb_partial = 0;
1832   if (input->mb_cur_max > 1 &&
1833       !re_string_first_byte (input, re_string_cur_idx (input)))
1834     {
1835       token->type = CHARACTER;
1836       token->mb_partial = 1;
1837       return 1;
1838     }
1839 #endif
1840   if (c == '\\')
1841     {
1842       unsigned char c2;
1843       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1844         {
1845           token->type = BACK_SLASH;
1846           return 1;
1847         }
1848
1849       c2 = re_string_peek_byte_case (input, 1);
1850       token->opr.c = c2;
1851       token->type = CHARACTER;
1852 #ifdef RE_ENABLE_I18N
1853       if (input->mb_cur_max > 1)
1854         {
1855           wint_t wc = re_string_wchar_at (input,
1856                                           re_string_cur_idx (input) + 1);
1857           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1858         }
1859       else
1860 #endif
1861         token->word_char = IS_WORD_CHAR (c2) != 0;
1862
1863       switch (c2)
1864         {
1865         case '|':
1866           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1867             token->type = OP_ALT;
1868           break;
1869         case '1': case '2': case '3': case '4': case '5':
1870         case '6': case '7': case '8': case '9':
1871           if (!(syntax & RE_NO_BK_REFS))
1872             {
1873               token->type = OP_BACK_REF;
1874               token->opr.idx = c2 - '1';
1875             }
1876           break;
1877         case '<':
1878           if (!(syntax & RE_NO_GNU_OPS))
1879             {
1880               token->type = ANCHOR;
1881               token->opr.ctx_type = WORD_FIRST;
1882             }
1883           break;
1884         case '>':
1885           if (!(syntax & RE_NO_GNU_OPS))
1886             {
1887               token->type = ANCHOR;
1888               token->opr.ctx_type = WORD_LAST;
1889             }
1890           break;
1891         case 'b':
1892           if (!(syntax & RE_NO_GNU_OPS))
1893             {
1894               token->type = ANCHOR;
1895               token->opr.ctx_type = WORD_DELIM;
1896             }
1897           break;
1898         case 'B':
1899           if (!(syntax & RE_NO_GNU_OPS))
1900             {
1901               token->type = ANCHOR;
1902               token->opr.ctx_type = NOT_WORD_DELIM;
1903             }
1904           break;
1905         case 'w':
1906           if (!(syntax & RE_NO_GNU_OPS))
1907             token->type = OP_WORD;
1908           break;
1909         case 'W':
1910           if (!(syntax & RE_NO_GNU_OPS))
1911             token->type = OP_NOTWORD;
1912           break;
1913         case 's':
1914           if (!(syntax & RE_NO_GNU_OPS))
1915             token->type = OP_SPACE;
1916           break;
1917         case 'S':
1918           if (!(syntax & RE_NO_GNU_OPS))
1919             token->type = OP_NOTSPACE;
1920           break;
1921         case '`':
1922           if (!(syntax & RE_NO_GNU_OPS))
1923             {
1924               token->type = ANCHOR;
1925               token->opr.ctx_type = BUF_FIRST;
1926             }
1927           break;
1928         case '\'':
1929           if (!(syntax & RE_NO_GNU_OPS))
1930             {
1931               token->type = ANCHOR;
1932               token->opr.ctx_type = BUF_LAST;
1933             }
1934           break;
1935         case '(':
1936           if (!(syntax & RE_NO_BK_PARENS))
1937             token->type = OP_OPEN_SUBEXP;
1938           break;
1939         case ')':
1940           if (!(syntax & RE_NO_BK_PARENS))
1941             token->type = OP_CLOSE_SUBEXP;
1942           break;
1943         case '+':
1944           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1945             token->type = OP_DUP_PLUS;
1946           break;
1947         case '?':
1948           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1949             token->type = OP_DUP_QUESTION;
1950           break;
1951         case '{':
1952           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1953             token->type = OP_OPEN_DUP_NUM;
1954           break;
1955         case '}':
1956           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1957             token->type = OP_CLOSE_DUP_NUM;
1958           break;
1959         default:
1960           break;
1961         }
1962       return 2;
1963     }
1964
1965   token->type = CHARACTER;
1966 #ifdef RE_ENABLE_I18N
1967   if (input->mb_cur_max > 1)
1968     {
1969       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1970       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1971     }
1972   else
1973 #endif
1974     token->word_char = IS_WORD_CHAR (token->opr.c);
1975
1976   switch (c)
1977     {
1978     case '\n':
1979       if (syntax & RE_NEWLINE_ALT)
1980         token->type = OP_ALT;
1981       break;
1982     case '|':
1983       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1984         token->type = OP_ALT;
1985       break;
1986     case '*':
1987       token->type = OP_DUP_ASTERISK;
1988       break;
1989     case '+':
1990       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1991         token->type = OP_DUP_PLUS;
1992       break;
1993     case '?':
1994       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1995         token->type = OP_DUP_QUESTION;
1996       break;
1997     case '{':
1998       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1999         token->type = OP_OPEN_DUP_NUM;
2000       break;
2001     case '}':
2002       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2003         token->type = OP_CLOSE_DUP_NUM;
2004       break;
2005     case '(':
2006       if (syntax & RE_NO_BK_PARENS)
2007         token->type = OP_OPEN_SUBEXP;
2008       break;
2009     case ')':
2010       if (syntax & RE_NO_BK_PARENS)
2011         token->type = OP_CLOSE_SUBEXP;
2012       break;
2013     case '[':
2014       token->type = OP_OPEN_BRACKET;
2015       break;
2016     case '.':
2017       token->type = OP_PERIOD;
2018       break;
2019     case '^':
2020       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2021           re_string_cur_idx (input) != 0)
2022         {
2023           char prev = re_string_peek_byte (input, -1);
2024           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2025             break;
2026         }
2027       token->type = ANCHOR;
2028       token->opr.ctx_type = LINE_FIRST;
2029       break;
2030     case '$':
2031       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2032           re_string_cur_idx (input) + 1 != re_string_length (input))
2033         {
2034           re_token_t next;
2035           re_string_skip_bytes (input, 1);
2036           peek_token (&next, input, syntax);
2037           re_string_skip_bytes (input, -1);
2038           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2039             break;
2040         }
2041       token->type = ANCHOR;
2042       token->opr.ctx_type = LINE_LAST;
2043       break;
2044     default:
2045       break;
2046     }
2047   return 1;
2048 }
2049
2050 /* Peek a token from INPUT, and return the length of the token.
2051    We must not use this function out of bracket expressions.  */
2052
2053 static int
2054 internal_function
2055 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2056 {
2057   unsigned char c;
2058   if (re_string_eoi (input))
2059     {
2060       token->type = END_OF_RE;
2061       return 0;
2062     }
2063   c = re_string_peek_byte (input, 0);
2064   token->opr.c = c;
2065
2066 #ifdef RE_ENABLE_I18N
2067   if (input->mb_cur_max > 1 &&
2068       !re_string_first_byte (input, re_string_cur_idx (input)))
2069     {
2070       token->type = CHARACTER;
2071       return 1;
2072     }
2073 #endif /* RE_ENABLE_I18N */
2074
2075   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2076       && re_string_cur_idx (input) + 1 < re_string_length (input))
2077     {
2078       /* In this case, '\' escape a character.  */
2079       unsigned char c2;
2080       re_string_skip_bytes (input, 1);
2081       c2 = re_string_peek_byte (input, 0);
2082       token->opr.c = c2;
2083       token->type = CHARACTER;
2084       return 1;
2085     }
2086   if (c == '[') /* '[' is a special char in a bracket exps.  */
2087     {
2088       unsigned char c2;
2089       int token_len;
2090       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2091         c2 = re_string_peek_byte (input, 1);
2092       else
2093         c2 = 0;
2094       token->opr.c = c2;
2095       token_len = 2;
2096       switch (c2)
2097         {
2098         case '.':
2099           token->type = OP_OPEN_COLL_ELEM;
2100           break;
2101         case '=':
2102           token->type = OP_OPEN_EQUIV_CLASS;
2103           break;
2104         case ':':
2105           if (syntax & RE_CHAR_CLASSES)
2106             {
2107               token->type = OP_OPEN_CHAR_CLASS;
2108               break;
2109             }
2110           /* else fall through.  */
2111         default:
2112           token->type = CHARACTER;
2113           token->opr.c = c;
2114           token_len = 1;
2115           break;
2116         }
2117       return token_len;
2118     }
2119   switch (c)
2120     {
2121     case '-':
2122       token->type = OP_CHARSET_RANGE;
2123       break;
2124     case ']':
2125       token->type = OP_CLOSE_BRACKET;
2126       break;
2127     case '^':
2128       token->type = OP_NON_MATCH_LIST;
2129       break;
2130     default:
2131       token->type = CHARACTER;
2132     }
2133   return 1;
2134 }
2135 \f
2136 /* Functions for parser.  */
2137
2138 /* Entry point of the parser.
2139    Parse the regular expression REGEXP and return the structure tree.
2140    If an error occurs, ERR is set by error code, and return NULL.
2141    This function build the following tree, from regular expression <reg_exp>:
2142            CAT
2143            / \
2144           /   \
2145    <reg_exp>  EOR
2146
2147    CAT means concatenation.
2148    EOR means end of regular expression.  */
2149
2150 static bin_tree_t *
2151 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2152        reg_errcode_t *err)
2153 {
2154   re_dfa_t *dfa = preg->buffer;
2155   bin_tree_t *tree, *eor, *root;
2156   re_token_t current_token;
2157   dfa->syntax = syntax;
2158   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2159   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2160   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2161     return NULL;
2162   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2163   if (tree != NULL)
2164     root = create_tree (dfa, tree, eor, CONCAT);
2165   else
2166     root = eor;
2167   if (BE (eor == NULL || root == NULL, 0))
2168     {
2169       *err = REG_ESPACE;
2170       return NULL;
2171     }
2172   return root;
2173 }
2174
2175 /* This function build the following tree, from regular expression
2176    <branch1>|<branch2>:
2177            ALT
2178            / \
2179           /   \
2180    <branch1> <branch2>
2181
2182    ALT means alternative, which represents the operator '|'.  */
2183
2184 static bin_tree_t *
2185 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2186                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2187 {
2188   re_dfa_t *dfa = preg->buffer;
2189   bin_tree_t *tree, *branch = NULL;
2190   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2191   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2192     return NULL;
2193
2194   while (token->type == OP_ALT)
2195     {
2196       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2197       if (token->type != OP_ALT && token->type != END_OF_RE
2198           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2199         {
2200           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2201           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2202             {
2203               if (tree != NULL)
2204                 postorder (tree, free_tree, NULL);
2205               return NULL;
2206             }
2207         }
2208       else
2209         branch = NULL;
2210       tree = create_tree (dfa, tree, branch, OP_ALT);
2211       if (BE (tree == NULL, 0))
2212         {
2213           *err = REG_ESPACE;
2214           return NULL;
2215         }
2216     }
2217   return tree;
2218 }
2219
2220 /* This function build the following tree, from regular expression
2221    <exp1><exp2>:
2222         CAT
2223         / \
2224        /   \
2225    <exp1> <exp2>
2226
2227    CAT means concatenation.  */
2228
2229 static bin_tree_t *
2230 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2231               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2232 {
2233   bin_tree_t *tree, *expr;
2234   re_dfa_t *dfa = preg->buffer;
2235   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2236   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2237     return NULL;
2238
2239   while (token->type != OP_ALT && token->type != END_OF_RE
2240          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2241     {
2242       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2243       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2244         {
2245           if (tree != NULL)
2246             postorder (tree, free_tree, NULL);
2247           return NULL;
2248         }
2249       if (tree != NULL && expr != NULL)
2250         {
2251           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2252           if (newtree == NULL)
2253             {
2254               postorder (expr, free_tree, NULL);
2255               postorder (tree, free_tree, NULL);
2256               *err = REG_ESPACE;
2257               return NULL;
2258             }
2259           tree = newtree;
2260         }
2261       else if (tree == NULL)
2262         tree = expr;
2263       /* Otherwise expr == NULL, we don't need to create new tree.  */
2264     }
2265   return tree;
2266 }
2267
2268 /* This function build the following tree, from regular expression a*:
2269          *
2270          |
2271          a
2272 */
2273
2274 static bin_tree_t *
2275 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2276                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2277 {
2278   re_dfa_t *dfa = preg->buffer;
2279   bin_tree_t *tree;
2280   switch (token->type)
2281     {
2282     case CHARACTER:
2283       tree = create_token_tree (dfa, NULL, NULL, token);
2284       if (BE (tree == NULL, 0))
2285         {
2286           *err = REG_ESPACE;
2287           return NULL;
2288         }
2289 #ifdef RE_ENABLE_I18N
2290       if (dfa->mb_cur_max > 1)
2291         {
2292           while (!re_string_eoi (regexp)
2293                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2294             {
2295               bin_tree_t *mbc_remain;
2296               fetch_token (token, regexp, syntax);
2297               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2298               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2299               if (BE (mbc_remain == NULL || tree == NULL, 0))
2300                 {
2301                   *err = REG_ESPACE;
2302                   return NULL;
2303                 }
2304             }
2305         }
2306 #endif
2307       break;
2308     case OP_OPEN_SUBEXP:
2309       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2310       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2311         return NULL;
2312       break;
2313     case OP_OPEN_BRACKET:
2314       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2315       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2316         return NULL;
2317       break;
2318     case OP_BACK_REF:
2319       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2320         {
2321           *err = REG_ESUBREG;
2322           return NULL;
2323         }
2324       dfa->used_bkref_map |= 1 << token->opr.idx;
2325       tree = create_token_tree (dfa, NULL, NULL, token);
2326       if (BE (tree == NULL, 0))
2327         {
2328           *err = REG_ESPACE;
2329           return NULL;
2330         }
2331       ++dfa->nbackref;
2332       dfa->has_mb_node = 1;
2333       break;
2334     case OP_OPEN_DUP_NUM:
2335       if (syntax & RE_CONTEXT_INVALID_DUP)
2336         {
2337           *err = REG_BADRPT;
2338           return NULL;
2339         }
2340       /* FALLTHROUGH */
2341     case OP_DUP_ASTERISK:
2342     case OP_DUP_PLUS:
2343     case OP_DUP_QUESTION:
2344       if (syntax & RE_CONTEXT_INVALID_OPS)
2345         {
2346           *err = REG_BADRPT;
2347           return NULL;
2348         }
2349       else if (syntax & RE_CONTEXT_INDEP_OPS)
2350         {
2351           fetch_token (token, regexp, syntax);
2352           return parse_expression (regexp, preg, token, syntax, nest, err);
2353         }
2354       /* else fall through  */
2355     case OP_CLOSE_SUBEXP:
2356       if ((token->type == OP_CLOSE_SUBEXP) &&
2357           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2358         {
2359           *err = REG_ERPAREN;
2360           return NULL;
2361         }
2362       /* else fall through  */
2363     case OP_CLOSE_DUP_NUM:
2364       /* We treat it as a normal character.  */
2365
2366       /* Then we can these characters as normal characters.  */
2367       token->type = CHARACTER;
2368       /* mb_partial and word_char bits should be initialized already
2369          by peek_token.  */
2370       tree = create_token_tree (dfa, NULL, NULL, token);
2371       if (BE (tree == NULL, 0))
2372         {
2373           *err = REG_ESPACE;
2374           return NULL;
2375         }
2376       break;
2377     case ANCHOR:
2378       if ((token->opr.ctx_type
2379            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2380           && dfa->word_ops_used == 0)
2381         init_word_char (dfa);
2382       if (token->opr.ctx_type == WORD_DELIM
2383           || token->opr.ctx_type == NOT_WORD_DELIM)
2384         {
2385           bin_tree_t *tree_first, *tree_last;
2386           if (token->opr.ctx_type == WORD_DELIM)
2387             {
2388               token->opr.ctx_type = WORD_FIRST;
2389               tree_first = create_token_tree (dfa, NULL, NULL, token);
2390               token->opr.ctx_type = WORD_LAST;
2391             }
2392           else
2393             {
2394               token->opr.ctx_type = INSIDE_WORD;
2395               tree_first = create_token_tree (dfa, NULL, NULL, token);
2396               token->opr.ctx_type = INSIDE_NOTWORD;
2397             }
2398           tree_last = create_token_tree (dfa, NULL, NULL, token);
2399           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2400           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2401             {
2402               *err = REG_ESPACE;
2403               return NULL;
2404             }
2405         }
2406       else
2407         {
2408           tree = create_token_tree (dfa, NULL, NULL, token);
2409           if (BE (tree == NULL, 0))
2410             {
2411               *err = REG_ESPACE;
2412               return NULL;
2413             }
2414         }
2415       /* We must return here, since ANCHORs can't be followed
2416          by repetition operators.
2417          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2418              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2419       fetch_token (token, regexp, syntax);
2420       return tree;
2421     case OP_PERIOD:
2422       tree = create_token_tree (dfa, NULL, NULL, token);
2423       if (BE (tree == NULL, 0))
2424         {
2425           *err = REG_ESPACE;
2426           return NULL;
2427         }
2428       if (dfa->mb_cur_max > 1)
2429         dfa->has_mb_node = 1;
2430       break;
2431     case OP_WORD:
2432     case OP_NOTWORD:
2433       tree = build_charclass_op (dfa, regexp->trans,
2434                                  "alnum",
2435                                  "_",
2436                                  token->type == OP_NOTWORD, err);
2437       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2438         return NULL;
2439       break;
2440     case OP_SPACE:
2441     case OP_NOTSPACE:
2442       tree = build_charclass_op (dfa, regexp->trans,
2443                                  "space",
2444                                  "",
2445                                  token->type == OP_NOTSPACE, err);
2446       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2447         return NULL;
2448       break;
2449     case OP_ALT:
2450     case END_OF_RE:
2451       return NULL;
2452     case BACK_SLASH:
2453       *err = REG_EESCAPE;
2454       return NULL;
2455     default:
2456       /* Must not happen?  */
2457 #ifdef DEBUG
2458       assert (0);
2459 #endif
2460       return NULL;
2461     }
2462   fetch_token (token, regexp, syntax);
2463
2464   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2465          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2466     {
2467       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2468                                            syntax, err);
2469       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2470         {
2471           if (tree != NULL)
2472             postorder (tree, free_tree, NULL);
2473           return NULL;
2474         }
2475       tree = dup_tree;
2476       /* In BRE consecutive duplications are not allowed.  */
2477       if ((syntax & RE_CONTEXT_INVALID_DUP)
2478           && (token->type == OP_DUP_ASTERISK
2479               || token->type == OP_OPEN_DUP_NUM))
2480         {
2481           if (tree != NULL)
2482             postorder (tree, free_tree, NULL);
2483           *err = REG_BADRPT;
2484           return NULL;
2485         }
2486     }
2487
2488   return tree;
2489 }
2490
2491 /* This function build the following tree, from regular expression
2492    (<reg_exp>):
2493          SUBEXP
2494             |
2495         <reg_exp>
2496 */
2497
2498 static bin_tree_t *
2499 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2500                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2501 {
2502   re_dfa_t *dfa = preg->buffer;
2503   bin_tree_t *tree;
2504   size_t cur_nsub;
2505   cur_nsub = preg->re_nsub++;
2506
2507   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2508
2509   /* The subexpression may be a null string.  */
2510   if (token->type == OP_CLOSE_SUBEXP)
2511     tree = NULL;
2512   else
2513     {
2514       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2515       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2516         {
2517           if (tree != NULL)
2518             postorder (tree, free_tree, NULL);
2519           *err = REG_EPAREN;
2520         }
2521       if (BE (*err != REG_NOERROR, 0))
2522         return NULL;
2523     }
2524
2525   if (cur_nsub <= '9' - '1')
2526     dfa->completed_bkref_map |= 1 << cur_nsub;
2527
2528   tree = create_tree (dfa, tree, NULL, SUBEXP);
2529   if (BE (tree == NULL, 0))
2530     {
2531       *err = REG_ESPACE;
2532       return NULL;
2533     }
2534   tree->token.opr.idx = cur_nsub;
2535   return tree;
2536 }
2537
2538 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2539
2540 static bin_tree_t *
2541 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2542               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2543 {
2544   bin_tree_t *tree = NULL, *old_tree = NULL;
2545   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2546   re_token_t start_token = *token;
2547
2548   if (token->type == OP_OPEN_DUP_NUM)
2549     {
2550       end = 0;
2551       start = fetch_number (regexp, token, syntax);
2552       if (start == REG_MISSING)
2553         {
2554           if (token->type == CHARACTER && token->opr.c == ',')
2555             start = 0; /* We treat "{,m}" as "{0,m}".  */
2556           else
2557             {
2558               *err = REG_BADBR; /* <re>{} is invalid.  */
2559               return NULL;
2560             }
2561         }
2562       if (BE (start != REG_ERROR, 1))
2563         {
2564           /* We treat "{n}" as "{n,n}".  */
2565           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2566                  : ((token->type == CHARACTER && token->opr.c == ',')
2567                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2568         }
2569       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2570         {
2571           /* Invalid sequence.  */
2572           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2573             {
2574               if (token->type == END_OF_RE)
2575                 *err = REG_EBRACE;
2576               else
2577                 *err = REG_BADBR;
2578
2579               return NULL;
2580             }
2581
2582           /* If the syntax bit is set, rollback.  */
2583           re_string_set_index (regexp, start_idx);
2584           *token = start_token;
2585           token->type = CHARACTER;
2586           /* mb_partial and word_char bits should be already initialized by
2587              peek_token.  */
2588           return elem;
2589         }
2590
2591       if (BE ((end != REG_MISSING && start > end)
2592               || token->type != OP_CLOSE_DUP_NUM, 0))
2593         {
2594           /* First number greater than second.  */
2595           *err = REG_BADBR;
2596           return NULL;
2597         }
2598
2599       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2600         {
2601           *err = REG_ESIZE;
2602           return NULL;
2603         }
2604     }
2605   else
2606     {
2607       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2608       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2609     }
2610
2611   fetch_token (token, regexp, syntax);
2612
2613   if (BE (elem == NULL, 0))
2614     return NULL;
2615   if (BE (start == 0 && end == 0, 0))
2616     {
2617       postorder (elem, free_tree, NULL);
2618       return NULL;
2619     }
2620
2621   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2622   if (BE (start > 0, 0))
2623     {
2624       tree = elem;
2625       for (i = 2; i <= start; ++i)
2626         {
2627           elem = duplicate_tree (elem, dfa);
2628           tree = create_tree (dfa, tree, elem, CONCAT);
2629           if (BE (elem == NULL || tree == NULL, 0))
2630             goto parse_dup_op_espace;
2631         }
2632
2633       if (start == end)
2634         return tree;
2635
2636       /* Duplicate ELEM before it is marked optional.  */
2637       elem = duplicate_tree (elem, dfa);
2638       if (BE (elem == NULL, 0))
2639         goto parse_dup_op_espace;
2640       old_tree = tree;
2641     }
2642   else
2643     old_tree = NULL;
2644
2645   if (elem->token.type == SUBEXP)
2646     {
2647       uintptr_t subidx = elem->token.opr.idx;
2648       postorder (elem, mark_opt_subexp, (void *) subidx);
2649     }
2650
2651   tree = create_tree (dfa, elem, NULL,
2652                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2653   if (BE (tree == NULL, 0))
2654     goto parse_dup_op_espace;
2655
2656 /* From gnulib's "intprops.h":
2657    True if the arithmetic type T is signed.  */
2658 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2659
2660   /* This loop is actually executed only when end != REG_MISSING,
2661      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2662      already created the start+1-th copy.  */
2663   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2664     for (i = start + 2; i <= end; ++i)
2665       {
2666         elem = duplicate_tree (elem, dfa);
2667         tree = create_tree (dfa, tree, elem, CONCAT);
2668         if (BE (elem == NULL || tree == NULL, 0))
2669           goto parse_dup_op_espace;
2670
2671         tree = create_tree (dfa, tree, NULL, OP_ALT);
2672         if (BE (tree == NULL, 0))
2673           goto parse_dup_op_espace;
2674       }
2675
2676   if (old_tree)
2677     tree = create_tree (dfa, old_tree, tree, CONCAT);
2678
2679   return tree;
2680
2681  parse_dup_op_espace:
2682   *err = REG_ESPACE;
2683   return NULL;
2684 }
2685
2686 /* Size of the names for collating symbol/equivalence_class/character_class.
2687    I'm not sure, but maybe enough.  */
2688 #define BRACKET_NAME_BUF_SIZE 32
2689
2690 #ifndef _LIBC
2691   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2692      Build the range expression which starts from START_ELEM, and ends
2693      at END_ELEM.  The result are written to MBCSET and SBCSET.
2694      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2695      mbcset->range_ends, is a pointer argument since we may
2696      update it.  */
2697
2698 static reg_errcode_t
2699 internal_function
2700 # ifdef RE_ENABLE_I18N
2701 build_range_exp (const reg_syntax_t syntax,
2702                  bitset_t sbcset,
2703                  re_charset_t *mbcset,
2704                  Idx *range_alloc,
2705                  const bracket_elem_t *start_elem,
2706                  const bracket_elem_t *end_elem)
2707 # else /* not RE_ENABLE_I18N */
2708 build_range_exp (const reg_syntax_t syntax,
2709                  bitset_t sbcset,
2710                  const bracket_elem_t *start_elem,
2711                  const bracket_elem_t *end_elem)
2712 # endif /* not RE_ENABLE_I18N */
2713 {
2714   unsigned int start_ch, end_ch;
2715   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2716   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2717           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2718           0))
2719     return REG_ERANGE;
2720
2721   /* We can handle no multi character collating elements without libc
2722      support.  */
2723   if (BE ((start_elem->type == COLL_SYM
2724            && strlen ((char *) start_elem->opr.name) > 1)
2725           || (end_elem->type == COLL_SYM
2726               && strlen ((char *) end_elem->opr.name) > 1), 0))
2727     return REG_ECOLLATE;
2728
2729 # ifdef RE_ENABLE_I18N
2730   {
2731     wchar_t wc;
2732     wint_t start_wc;
2733     wint_t end_wc;
2734
2735     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2736                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2737                    : 0));
2738     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2739               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2740                  : 0));
2741     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2742                 ? __btowc (start_ch) : start_elem->opr.wch);
2743     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2744               ? __btowc (end_ch) : end_elem->opr.wch);
2745     if (start_wc == WEOF || end_wc == WEOF)
2746       return REG_ECOLLATE;
2747     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2748       return REG_ERANGE;
2749
2750     /* Got valid collation sequence values, add them as a new entry.
2751        However, for !_LIBC we have no collation elements: if the
2752        character set is single byte, the single byte character set
2753        that we build below suffices.  parse_bracket_exp passes
2754        no MBCSET if dfa->mb_cur_max == 1.  */
2755     if (mbcset)
2756       {
2757         /* Check the space of the arrays.  */
2758         if (BE (*range_alloc == mbcset->nranges, 0))
2759           {
2760             /* There is not enough space, need realloc.  */
2761             wchar_t *new_array_start, *new_array_end;
2762             Idx new_nranges;
2763
2764             /* +1 in case of mbcset->nranges is 0.  */
2765             new_nranges = 2 * mbcset->nranges + 1;
2766             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2767                are NULL if *range_alloc == 0.  */
2768             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2769                                           new_nranges);
2770             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2771                                         new_nranges);
2772
2773             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2774               return REG_ESPACE;
2775
2776             mbcset->range_starts = new_array_start;
2777             mbcset->range_ends = new_array_end;
2778             *range_alloc = new_nranges;
2779           }
2780
2781         mbcset->range_starts[mbcset->nranges] = start_wc;
2782         mbcset->range_ends[mbcset->nranges++] = end_wc;
2783       }
2784
2785     /* Build the table for single byte characters.  */
2786     for (wc = 0; wc < SBC_MAX; ++wc)
2787       {
2788         if (start_wc <= wc && wc <= end_wc)
2789           bitset_set (sbcset, wc);
2790       }
2791   }
2792 # else /* not RE_ENABLE_I18N */
2793   {
2794     unsigned int ch;
2795     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2796                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2797                    : 0));
2798     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2799               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2800                  : 0));
2801     if (start_ch > end_ch)
2802       return REG_ERANGE;
2803     /* Build the table for single byte characters.  */
2804     for (ch = 0; ch < SBC_MAX; ++ch)
2805       if (start_ch <= ch  && ch <= end_ch)
2806         bitset_set (sbcset, ch);
2807   }
2808 # endif /* not RE_ENABLE_I18N */
2809   return REG_NOERROR;
2810 }
2811 #endif /* not _LIBC */
2812
2813 #ifndef _LIBC
2814 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2815    Build the collating element which is represented by NAME.
2816    The result are written to MBCSET and SBCSET.
2817    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2818    pointer argument since we may update it.  */
2819
2820 static reg_errcode_t
2821 internal_function
2822 # ifdef RE_ENABLE_I18N
2823 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2824                         Idx *coll_sym_alloc, const unsigned char *name)
2825 # else /* not RE_ENABLE_I18N */
2826 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2827 # endif /* not RE_ENABLE_I18N */
2828 {
2829   size_t name_len = strlen ((const char *) name);
2830   if (BE (name_len != 1, 0))
2831     return REG_ECOLLATE;
2832   else
2833     {
2834       bitset_set (sbcset, name[0]);
2835       return REG_NOERROR;
2836     }
2837 }
2838 #endif /* not _LIBC */
2839
2840 /* This function parse bracket expression like "[abc]", "[a-c]",
2841    "[[.a-a.]]" etc.  */
2842
2843 static bin_tree_t *
2844 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2845                    reg_syntax_t syntax, reg_errcode_t *err)
2846 {
2847 #ifdef _LIBC
2848   const unsigned char *collseqmb;
2849   const char *collseqwc;
2850   uint32_t nrules;
2851   int32_t table_size;
2852   const int32_t *symb_table;
2853   const unsigned char *extra;
2854
2855   /* Local function for parse_bracket_exp used in _LIBC environment.
2856      Seek the collating symbol entry corresponding to NAME.
2857      Return the index of the symbol in the SYMB_TABLE,
2858      or -1 if not found.  */
2859
2860   auto inline int32_t
2861   __attribute__ ((always_inline))
2862   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2863     {
2864       int32_t elem;
2865
2866       for (elem = 0; elem < table_size; elem++)
2867         if (symb_table[2 * elem] != 0)
2868           {
2869             int32_t idx = symb_table[2 * elem + 1];
2870             /* Skip the name of collating element name.  */
2871             idx += 1 + extra[idx];
2872             if (/* Compare the length of the name.  */
2873                 name_len == extra[idx]
2874                 /* Compare the name.  */
2875                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2876               /* Yep, this is the entry.  */
2877               return elem;
2878           }
2879       return -1;
2880     }
2881
2882   /* Local function for parse_bracket_exp used in _LIBC environment.
2883      Look up the collation sequence value of BR_ELEM.
2884      Return the value if succeeded, UINT_MAX otherwise.  */
2885
2886   auto inline unsigned int
2887   __attribute__ ((always_inline))
2888   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2889     {
2890       if (br_elem->type == SB_CHAR)
2891         {
2892           /*
2893           if (MB_CUR_MAX == 1)
2894           */
2895           if (nrules == 0)
2896             return collseqmb[br_elem->opr.ch];
2897           else
2898             {
2899               wint_t wc = __btowc (br_elem->opr.ch);
2900               return __collseq_table_lookup (collseqwc, wc);
2901             }
2902         }
2903       else if (br_elem->type == MB_CHAR)
2904         {
2905           if (nrules != 0)
2906             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2907         }
2908       else if (br_elem->type == COLL_SYM)
2909         {
2910           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2911           if (nrules != 0)
2912             {
2913               int32_t elem, idx;
2914               elem = seek_collating_symbol_entry (br_elem->opr.name,
2915                                                   sym_name_len);
2916               if (elem != -1)
2917                 {
2918                   /* We found the entry.  */
2919                   idx = symb_table[2 * elem + 1];
2920                   /* Skip the name of collating element name.  */
2921                   idx += 1 + extra[idx];
2922                   /* Skip the byte sequence of the collating element.  */
2923                   idx += 1 + extra[idx];
2924                   /* Adjust for the alignment.  */
2925                   idx = (idx + 3) & ~3;
2926                   /* Skip the multibyte collation sequence value.  */
2927                   idx += sizeof (unsigned int);
2928                   /* Skip the wide char sequence of the collating element.  */
2929                   idx += sizeof (unsigned int) *
2930                     (1 + *(unsigned int *) (extra + idx));
2931                   /* Return the collation sequence value.  */
2932                   return *(unsigned int *) (extra + idx);
2933                 }
2934               else if (sym_name_len == 1)
2935                 {
2936                   /* No valid character.  Match it as a single byte
2937                      character.  */
2938                   return collseqmb[br_elem->opr.name[0]];
2939                 }
2940             }
2941           else if (sym_name_len == 1)
2942             return collseqmb[br_elem->opr.name[0]];
2943         }
2944       return UINT_MAX;
2945     }
2946
2947   /* Local function for parse_bracket_exp used in _LIBC environment.
2948      Build the range expression which starts from START_ELEM, and ends
2949      at END_ELEM.  The result are written to MBCSET and SBCSET.
2950      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2951      mbcset->range_ends, is a pointer argument since we may
2952      update it.  */
2953
2954   auto inline reg_errcode_t
2955   __attribute__ ((always_inline))
2956   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2957                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2958     {
2959       unsigned int ch;
2960       uint32_t start_collseq;
2961       uint32_t end_collseq;
2962
2963       /* Equivalence Classes and Character Classes can't be a range
2964          start/end.  */
2965       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2966               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2967               0))
2968         return REG_ERANGE;
2969
2970       /* FIXME: Implement rational ranges here, too.  */
2971       start_collseq = lookup_collation_sequence_value (start_elem);
2972       end_collseq = lookup_collation_sequence_value (end_elem);
2973       /* Check start/end collation sequence values.  */
2974       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2975         return REG_ECOLLATE;
2976       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2977         return REG_ERANGE;
2978
2979       /* Got valid collation sequence values, add them as a new entry.
2980          However, if we have no collation elements, and the character set
2981          is single byte, the single byte character set that we
2982          build below suffices. */
2983       if (nrules > 0 || dfa->mb_cur_max > 1)
2984         {
2985           /* Check the space of the arrays.  */
2986           if (BE (*range_alloc == mbcset->nranges, 0))
2987             {
2988               /* There is not enough space, need realloc.  */
2989               uint32_t *new_array_start;
2990               uint32_t *new_array_end;
2991               Idx new_nranges;
2992
2993               /* +1 in case of mbcset->nranges is 0.  */
2994               new_nranges = 2 * mbcset->nranges + 1;
2995               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2996                                             new_nranges);
2997               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2998                                           new_nranges);
2999
3000               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3001                 return REG_ESPACE;
3002
3003               mbcset->range_starts = new_array_start;
3004               mbcset->range_ends = new_array_end;
3005               *range_alloc = new_nranges;
3006             }
3007
3008           mbcset->range_starts[mbcset->nranges] = start_collseq;
3009           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3010         }
3011
3012       /* Build the table for single byte characters.  */
3013       for (ch = 0; ch < SBC_MAX; ch++)
3014         {
3015           uint32_t ch_collseq;
3016           /*
3017           if (MB_CUR_MAX == 1)
3018           */
3019           if (nrules == 0)
3020             ch_collseq = collseqmb[ch];
3021           else
3022             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3023           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3024             bitset_set (sbcset, ch);
3025         }
3026       return REG_NOERROR;
3027     }
3028
3029   /* Local function for parse_bracket_exp used in _LIBC environment.
3030      Build the collating element which is represented by NAME.
3031      The result are written to MBCSET and SBCSET.
3032      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3033      pointer argument since we may update it.  */
3034
3035   auto inline reg_errcode_t
3036   __attribute__ ((always_inline))
3037   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3038                           Idx *coll_sym_alloc, const unsigned char *name)
3039     {
3040       int32_t elem, idx;
3041       size_t name_len = strlen ((const char *) name);
3042       if (nrules != 0)
3043         {
3044           elem = seek_collating_symbol_entry (name, name_len);
3045           if (elem != -1)
3046             {
3047               /* We found the entry.  */
3048               idx = symb_table[2 * elem + 1];
3049               /* Skip the name of collating element name.  */
3050               idx += 1 + extra[idx];
3051             }
3052           else if (name_len == 1)
3053             {
3054               /* No valid character, treat it as a normal
3055                  character.  */
3056               bitset_set (sbcset, name[0]);
3057               return REG_NOERROR;
3058             }
3059           else
3060             return REG_ECOLLATE;
3061
3062           /* Got valid collation sequence, add it as a new entry.  */
3063           /* Check the space of the arrays.  */
3064           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3065             {
3066               /* Not enough, realloc it.  */
3067               /* +1 in case of mbcset->ncoll_syms is 0.  */
3068               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3069               /* Use realloc since mbcset->coll_syms is NULL
3070                  if *alloc == 0.  */
3071               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3072                                                    new_coll_sym_alloc);
3073               if (BE (new_coll_syms == NULL, 0))
3074                 return REG_ESPACE;
3075               mbcset->coll_syms = new_coll_syms;
3076               *coll_sym_alloc = new_coll_sym_alloc;
3077             }
3078           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3079           return REG_NOERROR;
3080         }
3081       else
3082         {
3083           if (BE (name_len != 1, 0))
3084             return REG_ECOLLATE;
3085           else
3086             {
3087               bitset_set (sbcset, name[0]);
3088               return REG_NOERROR;
3089             }
3090         }
3091     }
3092 #endif
3093
3094   re_token_t br_token;
3095   re_bitset_ptr_t sbcset;
3096 #ifdef RE_ENABLE_I18N
3097   re_charset_t *mbcset;
3098   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3099   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3100 #endif /* not RE_ENABLE_I18N */
3101   bool non_match = false;
3102   bin_tree_t *work_tree;
3103   int token_len;
3104   bool first_round = true;
3105 #ifdef _LIBC
3106   collseqmb = (const unsigned char *)
3107     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3108   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3109   if (nrules)
3110     {
3111       /*
3112       if (MB_CUR_MAX > 1)
3113       */
3114       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3115       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3116       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3117                                                   _NL_COLLATE_SYMB_TABLEMB);
3118       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3119                                                    _NL_COLLATE_SYMB_EXTRAMB);
3120     }
3121 #endif
3122   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3123 #ifdef RE_ENABLE_I18N
3124   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3125 #endif /* RE_ENABLE_I18N */
3126 #ifdef RE_ENABLE_I18N
3127   if (BE (sbcset == NULL || mbcset == NULL, 0))
3128 #else
3129   if (BE (sbcset == NULL, 0))
3130 #endif /* RE_ENABLE_I18N */
3131     {
3132       re_free (sbcset);
3133 #ifdef RE_ENABLE_I18N
3134       re_free (mbcset);
3135 #endif
3136       *err = REG_ESPACE;
3137       return NULL;
3138     }
3139
3140   token_len = peek_token_bracket (token, regexp, syntax);
3141   if (BE (token->type == END_OF_RE, 0))
3142     {
3143       *err = REG_BADPAT;
3144       goto parse_bracket_exp_free_return;
3145     }
3146   if (token->type == OP_NON_MATCH_LIST)
3147     {
3148 #ifdef RE_ENABLE_I18N
3149       mbcset->non_match = 1;
3150 #endif /* not RE_ENABLE_I18N */
3151       non_match = true;
3152       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3153         bitset_set (sbcset, '\n');
3154       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3155       token_len = peek_token_bracket (token, regexp, syntax);
3156       if (BE (token->type == END_OF_RE, 0))
3157         {
3158           *err = REG_BADPAT;
3159           goto parse_bracket_exp_free_return;
3160         }
3161     }
3162
3163   /* We treat the first ']' as a normal character.  */
3164   if (token->type == OP_CLOSE_BRACKET)
3165     token->type = CHARACTER;
3166
3167   while (1)
3168     {
3169       bracket_elem_t start_elem, end_elem;
3170       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3171       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3172       reg_errcode_t ret;
3173       int token_len2 = 0;
3174       bool is_range_exp = false;
3175       re_token_t token2;
3176
3177       start_elem.opr.name = start_name_buf;
3178       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3179                                    syntax, first_round);
3180       if (BE (ret != REG_NOERROR, 0))
3181         {
3182           *err = ret;
3183           goto parse_bracket_exp_free_return;
3184         }
3185       first_round = false;
3186
3187       /* Get information about the next token.  We need it in any case.  */
3188       token_len = peek_token_bracket (token, regexp, syntax);
3189
3190       /* Do not check for ranges if we know they are not allowed.  */
3191       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3192         {
3193           if (BE (token->type == END_OF_RE, 0))
3194             {
3195               *err = REG_EBRACK;
3196               goto parse_bracket_exp_free_return;
3197             }
3198           if (token->type == OP_CHARSET_RANGE)
3199             {
3200               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3201               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3202               if (BE (token2.type == END_OF_RE, 0))
3203                 {
3204                   *err = REG_EBRACK;
3205                   goto parse_bracket_exp_free_return;
3206                 }
3207               if (token2.type == OP_CLOSE_BRACKET)
3208                 {
3209                   /* We treat the last '-' as a normal character.  */
3210                   re_string_skip_bytes (regexp, -token_len);
3211                   token->type = CHARACTER;
3212                 }
3213               else
3214                 is_range_exp = true;
3215             }
3216         }
3217
3218       if (is_range_exp == true)
3219         {
3220           end_elem.opr.name = end_name_buf;
3221           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3222                                        dfa, syntax, true);
3223           if (BE (ret != REG_NOERROR, 0))
3224             {
3225               *err = ret;
3226               goto parse_bracket_exp_free_return;
3227             }
3228
3229           token_len = peek_token_bracket (token, regexp, syntax);
3230
3231 #ifdef _LIBC
3232           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3233                                   &start_elem, &end_elem);
3234 #else
3235 # ifdef RE_ENABLE_I18N
3236           *err = build_range_exp (syntax, sbcset,
3237                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3238                                   &range_alloc, &start_elem, &end_elem);
3239 # else
3240           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3241 # endif
3242 #endif /* RE_ENABLE_I18N */
3243           if (BE (*err != REG_NOERROR, 0))
3244             goto parse_bracket_exp_free_return;
3245         }
3246       else
3247         {
3248           switch (start_elem.type)
3249             {
3250             case SB_CHAR:
3251               bitset_set (sbcset, start_elem.opr.ch);
3252               break;
3253 #ifdef RE_ENABLE_I18N
3254             case MB_CHAR:
3255               /* Check whether the array has enough space.  */
3256               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3257                 {
3258                   wchar_t *new_mbchars;
3259                   /* Not enough, realloc it.  */
3260                   /* +1 in case of mbcset->nmbchars is 0.  */
3261                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3262                   /* Use realloc since array is NULL if *alloc == 0.  */
3263                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3264                                             mbchar_alloc);
3265                   if (BE (new_mbchars == NULL, 0))
3266                     goto parse_bracket_exp_espace;
3267                   mbcset->mbchars = new_mbchars;
3268                 }
3269               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3270               break;
3271 #endif /* RE_ENABLE_I18N */
3272             case EQUIV_CLASS:
3273               *err = build_equiv_class (sbcset,
3274 #ifdef RE_ENABLE_I18N
3275                                         mbcset, &equiv_class_alloc,
3276 #endif /* RE_ENABLE_I18N */
3277                                         start_elem.opr.name);
3278               if (BE (*err != REG_NOERROR, 0))
3279                 goto parse_bracket_exp_free_return;
3280               break;
3281             case COLL_SYM:
3282               *err = build_collating_symbol (sbcset,
3283 #ifdef RE_ENABLE_I18N
3284                                              mbcset, &coll_sym_alloc,
3285 #endif /* RE_ENABLE_I18N */
3286                                              start_elem.opr.name);
3287               if (BE (*err != REG_NOERROR, 0))
3288                 goto parse_bracket_exp_free_return;
3289               break;
3290             case CHAR_CLASS:
3291               *err = build_charclass (regexp->trans, sbcset,
3292 #ifdef RE_ENABLE_I18N
3293                                       mbcset, &char_class_alloc,
3294 #endif /* RE_ENABLE_I18N */
3295                                       (const char *) start_elem.opr.name,
3296                                       syntax);
3297               if (BE (*err != REG_NOERROR, 0))
3298                goto parse_bracket_exp_free_return;
3299               break;
3300             default:
3301               assert (0);
3302               break;
3303             }
3304         }
3305       if (BE (token->type == END_OF_RE, 0))
3306         {
3307           *err = REG_EBRACK;
3308           goto parse_bracket_exp_free_return;
3309         }
3310       if (token->type == OP_CLOSE_BRACKET)
3311         break;
3312     }
3313
3314   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3315
3316   /* If it is non-matching list.  */
3317   if (non_match)
3318     bitset_not (sbcset);
3319
3320 #ifdef RE_ENABLE_I18N
3321   /* Ensure only single byte characters are set.  */
3322   if (dfa->mb_cur_max > 1)
3323     bitset_mask (sbcset, dfa->sb_char);
3324
3325   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3326       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3327                                                      || mbcset->non_match)))
3328     {
3329       bin_tree_t *mbc_tree;
3330       int sbc_idx;
3331       /* Build a tree for complex bracket.  */
3332       dfa->has_mb_node = 1;
3333       br_token.type = COMPLEX_BRACKET;
3334       br_token.opr.mbcset = mbcset;
3335       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3336       if (BE (mbc_tree == NULL, 0))
3337         goto parse_bracket_exp_espace;
3338       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3339         if (sbcset[sbc_idx])
3340           break;
3341       /* If there are no bits set in sbcset, there is no point
3342          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3343       if (sbc_idx < BITSET_WORDS)
3344         {
3345           /* Build a tree for simple bracket.  */
3346           br_token.type = SIMPLE_BRACKET;
3347           br_token.opr.sbcset = sbcset;
3348           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3349           if (BE (work_tree == NULL, 0))
3350             goto parse_bracket_exp_espace;
3351
3352           /* Then join them by ALT node.  */
3353           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3354           if (BE (work_tree == NULL, 0))
3355             goto parse_bracket_exp_espace;
3356         }
3357       else
3358         {
3359           re_free (sbcset);
3360           work_tree = mbc_tree;
3361         }
3362     }
3363   else
3364 #endif /* not RE_ENABLE_I18N */
3365     {
3366 #ifdef RE_ENABLE_I18N
3367       free_charset (mbcset);
3368 #endif
3369       /* Build a tree for simple bracket.  */
3370       br_token.type = SIMPLE_BRACKET;
3371       br_token.opr.sbcset = sbcset;
3372       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3373       if (BE (work_tree == NULL, 0))
3374         goto parse_bracket_exp_espace;
3375     }
3376   return work_tree;
3377
3378  parse_bracket_exp_espace:
3379   *err = REG_ESPACE;
3380  parse_bracket_exp_free_return:
3381   re_free (sbcset);
3382 #ifdef RE_ENABLE_I18N
3383   free_charset (mbcset);
3384 #endif /* RE_ENABLE_I18N */
3385   return NULL;
3386 }
3387
3388 /* Parse an element in the bracket expression.  */
3389
3390 static reg_errcode_t
3391 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3392                        re_token_t *token, int token_len, re_dfa_t *dfa,
3393                        reg_syntax_t syntax, bool accept_hyphen)
3394 {
3395 #ifdef RE_ENABLE_I18N
3396   int cur_char_size;
3397   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3398   if (cur_char_size > 1)
3399     {
3400       elem->type = MB_CHAR;
3401       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3402       re_string_skip_bytes (regexp, cur_char_size);
3403       return REG_NOERROR;
3404     }
3405 #endif /* RE_ENABLE_I18N */
3406   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3407   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3408       || token->type == OP_OPEN_EQUIV_CLASS)
3409     return parse_bracket_symbol (elem, regexp, token);
3410   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3411     {
3412       /* A '-' must only appear as anything but a range indicator before
3413          the closing bracket.  Everything else is an error.  */
3414       re_token_t token2;
3415       (void) peek_token_bracket (&token2, regexp, syntax);
3416       if (token2.type != OP_CLOSE_BRACKET)
3417         /* The actual error value is not standardized since this whole
3418            case is undefined.  But ERANGE makes good sense.  */
3419         return REG_ERANGE;
3420     }
3421   elem->type = SB_CHAR;
3422   elem->opr.ch = token->opr.c;
3423   return REG_NOERROR;
3424 }
3425
3426 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3427    such as [:<character_class>:], [.<collating_element>.], and
3428    [=<equivalent_class>=].  */
3429
3430 static reg_errcode_t
3431 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3432                       re_token_t *token)
3433 {
3434   unsigned char ch, delim = token->opr.c;
3435   int i = 0;
3436   if (re_string_eoi(regexp))
3437     return REG_EBRACK;
3438   for (;; ++i)
3439     {
3440       if (i >= BRACKET_NAME_BUF_SIZE)
3441         return REG_EBRACK;
3442       if (token->type == OP_OPEN_CHAR_CLASS)
3443         ch = re_string_fetch_byte_case (regexp);
3444       else
3445         ch = re_string_fetch_byte (regexp);
3446       if (re_string_eoi(regexp))
3447         return REG_EBRACK;
3448       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3449         break;
3450       elem->opr.name[i] = ch;
3451     }
3452   re_string_skip_bytes (regexp, 1);
3453   elem->opr.name[i] = '\0';
3454   switch (token->type)
3455     {
3456     case OP_OPEN_COLL_ELEM:
3457       elem->type = COLL_SYM;
3458       break;
3459     case OP_OPEN_EQUIV_CLASS:
3460       elem->type = EQUIV_CLASS;
3461       break;
3462     case OP_OPEN_CHAR_CLASS:
3463       elem->type = CHAR_CLASS;
3464       break;
3465     default:
3466       break;
3467     }
3468   return REG_NOERROR;
3469 }
3470
3471   /* Helper function for parse_bracket_exp.
3472      Build the equivalence class which is represented by NAME.
3473      The result are written to MBCSET and SBCSET.
3474      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3475      is a pointer argument since we may update it.  */
3476
3477 static reg_errcode_t
3478 #ifdef RE_ENABLE_I18N
3479 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3480                    Idx *equiv_class_alloc, const unsigned char *name)
3481 #else /* not RE_ENABLE_I18N */
3482 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3483 #endif /* not RE_ENABLE_I18N */
3484 {
3485 #ifdef _LIBC
3486   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3487   if (nrules != 0)
3488     {
3489       const int32_t *table, *indirect;
3490       const unsigned char *weights, *extra, *cp;
3491       unsigned char char_buf[2];
3492       int32_t idx1, idx2;
3493       unsigned int ch;
3494       size_t len;
3495       /* This #include defines a local function!  */
3496 # include <locale/weight.h>
3497       /* Calculate the index for equivalence class.  */
3498       cp = name;
3499       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3500       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3501                                                _NL_COLLATE_WEIGHTMB);
3502       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3503                                                    _NL_COLLATE_EXTRAMB);
3504       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3505                                                 _NL_COLLATE_INDIRECTMB);
3506       idx1 = findidx (&cp, -1);
3507       if (BE (idx1 == 0 || *cp != '\0', 0))
3508         /* This isn't a valid character.  */
3509         return REG_ECOLLATE;
3510
3511       /* Build single byte matching table for this equivalence class.  */
3512       len = weights[idx1 & 0xffffff];
3513       for (ch = 0; ch < SBC_MAX; ++ch)
3514         {
3515           char_buf[0] = ch;
3516           cp = char_buf;
3517           idx2 = findidx (&cp, 1);
3518 /*
3519           idx2 = table[ch];
3520 */
3521           if (idx2 == 0)
3522             /* This isn't a valid character.  */
3523             continue;
3524           /* Compare only if the length matches and the collation rule
3525              index is the same.  */
3526           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3527             {
3528               int cnt = 0;
3529
3530               while (cnt <= len &&
3531                      weights[(idx1 & 0xffffff) + 1 + cnt]
3532                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3533                 ++cnt;
3534
3535               if (cnt > len)
3536                 bitset_set (sbcset, ch);
3537             }
3538         }
3539       /* Check whether the array has enough space.  */
3540       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3541         {
3542           /* Not enough, realloc it.  */
3543           /* +1 in case of mbcset->nequiv_classes is 0.  */
3544           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3545           /* Use realloc since the array is NULL if *alloc == 0.  */
3546           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3547                                                    int32_t,
3548                                                    new_equiv_class_alloc);
3549           if (BE (new_equiv_classes == NULL, 0))
3550             return REG_ESPACE;
3551           mbcset->equiv_classes = new_equiv_classes;
3552           *equiv_class_alloc = new_equiv_class_alloc;
3553         }
3554       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3555     }
3556   else
3557 #endif /* _LIBC */
3558     {
3559       if (BE (strlen ((const char *) name) != 1, 0))
3560         return REG_ECOLLATE;
3561       bitset_set (sbcset, *name);
3562     }
3563   return REG_NOERROR;
3564 }
3565
3566   /* Helper function for parse_bracket_exp.
3567      Build the character class which is represented by NAME.
3568      The result are written to MBCSET and SBCSET.
3569      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3570      is a pointer argument since we may update it.  */
3571
3572 static reg_errcode_t
3573 #ifdef RE_ENABLE_I18N
3574 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3575                  re_charset_t *mbcset, Idx *char_class_alloc,
3576                  const char *class_name, reg_syntax_t syntax)
3577 #else /* not RE_ENABLE_I18N */
3578 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3579                  const char *class_name, reg_syntax_t syntax)
3580 #endif /* not RE_ENABLE_I18N */
3581 {
3582   int i;
3583   const char *name = class_name;
3584
3585   /* In case of REG_ICASE "upper" and "lower" match the both of
3586      upper and lower cases.  */
3587   if ((syntax & RE_ICASE)
3588       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3589     name = "alpha";
3590
3591 #ifdef RE_ENABLE_I18N
3592   /* Check the space of the arrays.  */
3593   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3594     {
3595       /* Not enough, realloc it.  */
3596       /* +1 in case of mbcset->nchar_classes is 0.  */
3597       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3598       /* Use realloc since array is NULL if *alloc == 0.  */
3599       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3600                                                new_char_class_alloc);
3601       if (BE (new_char_classes == NULL, 0))
3602         return REG_ESPACE;
3603       mbcset->char_classes = new_char_classes;
3604       *char_class_alloc = new_char_class_alloc;
3605     }
3606   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3607 #endif /* RE_ENABLE_I18N */
3608
3609 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3610   do {                                          \
3611     if (BE (trans != NULL, 0))                  \
3612       {                                         \
3613         for (i = 0; i < SBC_MAX; ++i)           \
3614           if (ctype_func (i))                   \
3615             bitset_set (sbcset, trans[i]);      \
3616       }                                         \
3617     else                                        \
3618       {                                         \
3619         for (i = 0; i < SBC_MAX; ++i)           \
3620           if (ctype_func (i))                   \
3621             bitset_set (sbcset, i);             \
3622       }                                         \
3623   } while (0)
3624
3625   if (strcmp (name, "alnum") == 0)
3626     BUILD_CHARCLASS_LOOP (isalnum);
3627   else if (strcmp (name, "cntrl") == 0)
3628     BUILD_CHARCLASS_LOOP (iscntrl);
3629   else if (strcmp (name, "lower") == 0)
3630     BUILD_CHARCLASS_LOOP (islower);
3631   else if (strcmp (name, "space") == 0)
3632     BUILD_CHARCLASS_LOOP (isspace);
3633   else if (strcmp (name, "alpha") == 0)
3634     BUILD_CHARCLASS_LOOP (isalpha);
3635   else if (strcmp (name, "digit") == 0)
3636     BUILD_CHARCLASS_LOOP (isdigit);
3637   else if (strcmp (name, "print") == 0)
3638     BUILD_CHARCLASS_LOOP (isprint);
3639   else if (strcmp (name, "upper") == 0)
3640     BUILD_CHARCLASS_LOOP (isupper);
3641   else if (strcmp (name, "blank") == 0)
3642     BUILD_CHARCLASS_LOOP (isblank);
3643   else if (strcmp (name, "graph") == 0)
3644     BUILD_CHARCLASS_LOOP (isgraph);
3645   else if (strcmp (name, "punct") == 0)
3646     BUILD_CHARCLASS_LOOP (ispunct);
3647   else if (strcmp (name, "xdigit") == 0)
3648     BUILD_CHARCLASS_LOOP (isxdigit);
3649   else
3650     return REG_ECTYPE;
3651
3652   return REG_NOERROR;
3653 }
3654
3655 static bin_tree_t *
3656 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3657                     const char *class_name,
3658                     const char *extra, bool non_match,
3659                     reg_errcode_t *err)
3660 {
3661   re_bitset_ptr_t sbcset;
3662 #ifdef RE_ENABLE_I18N
3663   re_charset_t *mbcset;
3664   Idx alloc = 0;
3665 #endif /* not RE_ENABLE_I18N */
3666   reg_errcode_t ret;
3667   re_token_t br_token;
3668   bin_tree_t *tree;
3669
3670   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3671 #ifdef RE_ENABLE_I18N
3672   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3673 #endif /* RE_ENABLE_I18N */
3674
3675 #ifdef RE_ENABLE_I18N
3676   if (BE (sbcset == NULL || mbcset == NULL, 0))
3677 #else /* not RE_ENABLE_I18N */
3678   if (BE (sbcset == NULL, 0))
3679 #endif /* not RE_ENABLE_I18N */
3680     {
3681       *err = REG_ESPACE;
3682       return NULL;
3683     }
3684
3685   if (non_match)
3686     {
3687 #ifdef RE_ENABLE_I18N
3688       mbcset->non_match = 1;
3689 #endif /* not RE_ENABLE_I18N */
3690     }
3691
3692   /* We don't care the syntax in this case.  */
3693   ret = build_charclass (trans, sbcset,
3694 #ifdef RE_ENABLE_I18N
3695                          mbcset, &alloc,
3696 #endif /* RE_ENABLE_I18N */
3697                          class_name, 0);
3698
3699   if (BE (ret != REG_NOERROR, 0))
3700     {
3701       re_free (sbcset);
3702 #ifdef RE_ENABLE_I18N
3703       free_charset (mbcset);
3704 #endif /* RE_ENABLE_I18N */
3705       *err = ret;
3706       return NULL;
3707     }
3708   /* \w match '_' also.  */
3709   for (; *extra; extra++)
3710     bitset_set (sbcset, *extra);
3711
3712   /* If it is non-matching list.  */
3713   if (non_match)
3714     bitset_not (sbcset);
3715
3716 #ifdef RE_ENABLE_I18N
3717   /* Ensure only single byte characters are set.  */
3718   if (dfa->mb_cur_max > 1)
3719     bitset_mask (sbcset, dfa->sb_char);
3720 #endif
3721
3722   /* Build a tree for simple bracket.  */
3723   br_token.type = SIMPLE_BRACKET;
3724   br_token.opr.sbcset = sbcset;
3725   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3726   if (BE (tree == NULL, 0))
3727     goto build_word_op_espace;
3728
3729 #ifdef RE_ENABLE_I18N
3730   if (dfa->mb_cur_max > 1)
3731     {
3732       bin_tree_t *mbc_tree;
3733       /* Build a tree for complex bracket.  */
3734       br_token.type = COMPLEX_BRACKET;
3735       br_token.opr.mbcset = mbcset;
3736       dfa->has_mb_node = 1;
3737       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3738       if (BE (mbc_tree == NULL, 0))
3739         goto build_word_op_espace;
3740       /* Then join them by ALT node.  */
3741       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3742       if (BE (mbc_tree != NULL, 1))
3743         return tree;
3744     }
3745   else
3746     {
3747       free_charset (mbcset);
3748       return tree;
3749     }
3750 #else /* not RE_ENABLE_I18N */
3751   return tree;
3752 #endif /* not RE_ENABLE_I18N */
3753
3754  build_word_op_espace:
3755   re_free (sbcset);
3756 #ifdef RE_ENABLE_I18N
3757   free_charset (mbcset);
3758 #endif /* RE_ENABLE_I18N */
3759   *err = REG_ESPACE;
3760   return NULL;
3761 }
3762
3763 /* This is intended for the expressions like "a{1,3}".
3764    Fetch a number from 'input', and return the number.
3765    Return REG_MISSING if the number field is empty like "{,1}".
3766    Return RE_DUP_MAX + 1 if the number field is too large.
3767    Return REG_ERROR if an error occurred.  */
3768
3769 static Idx
3770 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3771 {
3772   Idx num = REG_MISSING;
3773   unsigned char c;
3774   while (1)
3775     {
3776       fetch_token (token, input, syntax);
3777       c = token->opr.c;
3778       if (BE (token->type == END_OF_RE, 0))
3779         return REG_ERROR;
3780       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3781         break;
3782       num = ((token->type != CHARACTER || c < '0' || '9' < c
3783               || num == REG_ERROR)
3784              ? REG_ERROR
3785              : num == REG_MISSING
3786              ? c - '0'
3787              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3788     }
3789   return num;
3790 }
3791 \f
3792 #ifdef RE_ENABLE_I18N
3793 static void
3794 free_charset (re_charset_t *cset)
3795 {
3796   re_free (cset->mbchars);
3797 # ifdef _LIBC
3798   re_free (cset->coll_syms);
3799   re_free (cset->equiv_classes);
3800   re_free (cset->range_starts);
3801   re_free (cset->range_ends);
3802 # endif
3803   re_free (cset->char_classes);
3804   re_free (cset);
3805 }
3806 #endif /* RE_ENABLE_I18N */
3807 \f
3808 /* Functions for binary tree operation.  */
3809
3810 /* Create a tree node.  */
3811
3812 static bin_tree_t *
3813 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3814              re_token_type_t type)
3815 {
3816   re_token_t t;
3817   t.type = type;
3818   return create_token_tree (dfa, left, right, &t);
3819 }
3820
3821 static bin_tree_t *
3822 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3823                    const re_token_t *token)
3824 {
3825   bin_tree_t *tree;
3826   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3827     {
3828       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3829
3830       if (storage == NULL)
3831         return NULL;
3832       storage->next = dfa->str_tree_storage;
3833       dfa->str_tree_storage = storage;
3834       dfa->str_tree_storage_idx = 0;
3835     }
3836   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3837
3838   tree->parent = NULL;
3839   tree->left = left;
3840   tree->right = right;
3841   tree->token = *token;
3842   tree->token.duplicated = 0;
3843   tree->token.opt_subexp = 0;
3844   tree->first = NULL;
3845   tree->next = NULL;
3846   tree->node_idx = REG_MISSING;
3847
3848   if (left != NULL)
3849     left->parent = tree;
3850   if (right != NULL)
3851     right->parent = tree;
3852   return tree;
3853 }
3854
3855 /* Mark the tree SRC as an optional subexpression.
3856    To be called from preorder or postorder.  */
3857
3858 static reg_errcode_t
3859 mark_opt_subexp (void *extra, bin_tree_t *node)
3860 {
3861   Idx idx = (uintptr_t) extra;
3862   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3863     node->token.opt_subexp = 1;
3864
3865   return REG_NOERROR;
3866 }
3867
3868 /* Free the allocated memory inside NODE. */
3869
3870 static void
3871 free_token (re_token_t *node)
3872 {
3873 #ifdef RE_ENABLE_I18N
3874   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3875     free_charset (node->opr.mbcset);
3876   else
3877 #endif /* RE_ENABLE_I18N */
3878     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3879       re_free (node->opr.sbcset);
3880 }
3881
3882 /* Worker function for tree walking.  Free the allocated memory inside NODE
3883    and its children. */
3884
3885 static reg_errcode_t
3886 free_tree (void *extra, bin_tree_t *node)
3887 {
3888   free_token (&node->token);
3889   return REG_NOERROR;
3890 }
3891
3892
3893 /* Duplicate the node SRC, and return new node.  This is a preorder
3894    visit similar to the one implemented by the generic visitor, but
3895    we need more infrastructure to maintain two parallel trees --- so,
3896    it's easier to duplicate.  */
3897
3898 static bin_tree_t *
3899 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3900 {
3901   const bin_tree_t *node;
3902   bin_tree_t *dup_root;
3903   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3904
3905   for (node = root; ; )
3906     {
3907       /* Create a new tree and link it back to the current parent.  */
3908       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3909       if (*p_new == NULL)
3910         return NULL;
3911       (*p_new)->parent = dup_node;
3912       (*p_new)->token.duplicated = 1;
3913       dup_node = *p_new;
3914
3915       /* Go to the left node, or up and to the right.  */
3916       if (node->left)
3917         {
3918           node = node->left;
3919           p_new = &dup_node->left;
3920         }
3921       else
3922         {
3923           const bin_tree_t *prev = NULL;
3924           while (node->right == prev || node->right == NULL)
3925             {
3926               prev = node;
3927               node = node->parent;
3928               dup_node = dup_node->parent;
3929               if (!node)
3930                 return dup_root;
3931             }
3932           node = node->right;
3933           p_new = &dup_node->right;
3934         }
3935     }
3936 }