git.gag.com Git - debian/tar/blob - gnu/regcomp.c

   1 /* -*- buffer-read-only: t -*- vi: set ro: */
   2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
   3 /* Extended regular expression matching and search library.
   4    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   5    This file is part of the GNU C Library.
   6    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   7
   8    The GNU C Library is free software; you can redistribute it and/or
   9    modify it under the terms of the GNU General Public
  10    License as published by the Free Software Foundation; either
  11    version 3 of the License, or (at your option) any later version.
  12
  13    The GNU C Library is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16    General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public
  19    License along with the GNU C Library; if not, see
  20    <http://www.gnu.org/licenses/>.  */
  21
  22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  23                                           size_t length, reg_syntax_t syntax);
  24 static void re_compile_fastmap_iter (regex_t *bufp,
  25                                      const re_dfastate_t *init_state,
  26                                      char *fastmap);
  27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t preorder (bin_tree_t *root,
  38                                reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                void *extra);
  40 static reg_errcode_t postorder (bin_tree_t *root,
  41                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  42                                 void *extra);
  43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  46                                  bin_tree_t *node);
  47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  50 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  51 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  52                                    unsigned int constraint);
  53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  55                                          Idx node, bool root);
  56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  57 static Idx fetch_number (re_string_t *input, re_token_t *token,
  58                          reg_syntax_t syntax);
  59 static int peek_token (re_token_t *token, re_string_t *input,
  60                         reg_syntax_t syntax) internal_function;
  61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  62                           reg_syntax_t syntax, reg_errcode_t *err);
  63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  64                                   re_token_t *token, reg_syntax_t syntax,
  65                                   Idx nest, reg_errcode_t *err);
  66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  67                                  re_token_t *token, reg_syntax_t syntax,
  68                                  Idx nest, reg_errcode_t *err);
  69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  70                                      re_token_t *token, reg_syntax_t syntax,
  71                                      Idx nest, reg_errcode_t *err);
  72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  73                                   re_token_t *token, reg_syntax_t syntax,
  74                                   Idx nest, reg_errcode_t *err);
  75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  76                                  re_dfa_t *dfa, re_token_t *token,
  77                                  reg_syntax_t syntax, reg_errcode_t *err);
  78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  79                                       re_token_t *token, reg_syntax_t syntax,
  80                                       reg_errcode_t *err);
  81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  82                                             re_string_t *regexp,
  83                                             re_token_t *token, int token_len,
  84                                             re_dfa_t *dfa,
  85                                             reg_syntax_t syntax,
  86                                             bool accept_hyphen);
  87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  88                                           re_string_t *regexp,
  89                                           re_token_t *token);
  90 #ifdef RE_ENABLE_I18N
  91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  92                                         re_charset_t *mbcset,
  93                                         Idx *equiv_class_alloc,
  94                                         const unsigned char *name);
  95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  96                                       bitset_t sbcset,
  97                                       re_charset_t *mbcset,
  98                                       Idx *char_class_alloc,
  99                                       const char *class_name,
 100                                       reg_syntax_t syntax);
 101 #else  /* not RE_ENABLE_I18N */
 102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 103                                         const unsigned char *name);
 104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 105                                       bitset_t sbcset,
 106                                       const char *class_name,
 107                                       reg_syntax_t syntax);
 108 #endif /* not RE_ENABLE_I18N */
 109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 110                                        RE_TRANSLATE_TYPE trans,
 111                                        const char *class_name,
 112                                        const char *extra,
 113                                        bool non_match, reg_errcode_t *err);
 114 static bin_tree_t *create_tree (re_dfa_t *dfa,
 115                                 bin_tree_t *left, bin_tree_t *right,
 116                                 re_token_type_t type);
 117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 118                                       bin_tree_t *left, bin_tree_t *right,
 119                                       const re_token_t *token);
 120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 121 static void free_token (re_token_t *node);
 122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 124 \f
 125 /* This table gives an error message for each of the error codes listed
 126    in regex.h.  Obviously the order here has to be same as there.
 127    POSIX doesn't require that we do anything for REG_NOERROR,
 128    but why not be nice?  */
 129
 130 static const char __re_error_msgid[] =
 131   {
 132 #define REG_NOERROR_IDX 0
 133     gettext_noop ("Success")    /* REG_NOERROR */
 134     "\0"
 135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 136     gettext_noop ("No match")   /* REG_NOMATCH */
 137     "\0"
 138 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 139     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 140     "\0"
 141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 142     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 143     "\0"
 144 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 145     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 146     "\0"
 147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 148     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 149     "\0"
 150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 151     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 152     "\0"
 153 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 154     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 155     "\0"
 156 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 157     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 158     "\0"
 159 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 160     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 161     "\0"
 162 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 163     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 164     "\0"
 165 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 166     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 167     "\0"
 168 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 169     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 170     "\0"
 171 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 172     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 173     "\0"
 174 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 175     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 176     "\0"
 177 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 178     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 179     "\0"
 180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 181     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 182   };
 183
 184 static const size_t __re_error_msgid_idx[] =
 185   {
 186     REG_NOERROR_IDX,
 187     REG_NOMATCH_IDX,
 188     REG_BADPAT_IDX,
 189     REG_ECOLLATE_IDX,
 190     REG_ECTYPE_IDX,
 191     REG_EESCAPE_IDX,
 192     REG_ESUBREG_IDX,
 193     REG_EBRACK_IDX,
 194     REG_EPAREN_IDX,
 195     REG_EBRACE_IDX,
 196     REG_BADBR_IDX,
 197     REG_ERANGE_IDX,
 198     REG_ESPACE_IDX,
 199     REG_BADRPT_IDX,
 200     REG_EEND_IDX,
 201     REG_ESIZE_IDX,
 202     REG_ERPAREN_IDX
 203   };
 204 \f
 205 /* Entry points for GNU code.  */
 206
 207 /* re_compile_pattern is the GNU regular expression compiler: it
 208    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 209    Returns 0 if the pattern was valid, otherwise an error string.
 210
 211    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 212    are set in BUFP on entry.  */
 213
 214 #ifdef _LIBC
 215 const char *
 216 re_compile_pattern (pattern, length, bufp)
 217     const char *pattern;
 218     size_t length;
 219     struct re_pattern_buffer *bufp;
 220 #else /* size_t might promote */
 221 const char *
 222 re_compile_pattern (const char *pattern, size_t length,
 223                     struct re_pattern_buffer *bufp)
 224 #endif
 225 {
 226   reg_errcode_t ret;
 227
 228   /* And GNU code determines whether or not to get register information
 229      by passing null for the REGS argument to re_match, etc., not by
 230      setting no_sub, unless RE_NO_SUB is set.  */
 231   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 232
 233   /* Match anchors at newline.  */
 234   bufp->newline_anchor = 1;
 235
 236   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 237
 238   if (!ret)
 239     return NULL;
 240   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 241 }
 242 #ifdef _LIBC
 243 weak_alias (__re_compile_pattern, re_compile_pattern)
 244 #endif
 245
 246 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 247    also be assigned to arbitrarily: each pattern buffer stores its own
 248    syntax, so it can be changed between regex compilations.  */
 249 /* This has no initializer because initialized variables in Emacs
 250    become read-only after dumping.  */
 251 reg_syntax_t re_syntax_options;
 252
 253
 254 /* Specify the precise syntax of regexps for compilation.  This provides
 255    for compatibility for various utilities which historically have
 256    different, incompatible syntaxes.
 257
 258    The argument SYNTAX is a bit mask comprised of the various bits
 259    defined in regex.h.  We return the old syntax.  */
 260
 261 reg_syntax_t
 262 re_set_syntax (syntax)
 263     reg_syntax_t syntax;
 264 {
 265   reg_syntax_t ret = re_syntax_options;
 266
 267   re_syntax_options = syntax;
 268   return ret;
 269 }
 270 #ifdef _LIBC
 271 weak_alias (__re_set_syntax, re_set_syntax)
 272 #endif
 273
 274 int
 275 re_compile_fastmap (bufp)
 276     struct re_pattern_buffer *bufp;
 277 {
 278   re_dfa_t *dfa = bufp->buffer;
 279   char *fastmap = bufp->fastmap;
 280
 281   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 282   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 283   if (dfa->init_state != dfa->init_state_word)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 285   if (dfa->init_state != dfa->init_state_nl)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 287   if (dfa->init_state != dfa->init_state_begbuf)
 288     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 289   bufp->fastmap_accurate = 1;
 290   return 0;
 291 }
 292 #ifdef _LIBC
 293 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 294 #endif
 295
 296 static inline void
 297 __attribute__ ((always_inline))
 298 re_set_fastmap (char *fastmap, bool icase, int ch)
 299 {
 300   fastmap[ch] = 1;
 301   if (icase)
 302     fastmap[tolower (ch)] = 1;
 303 }
 304
 305 /* Helper function for re_compile_fastmap.
 306    Compile fastmap for the initial_state INIT_STATE.  */
 307
 308 static void
 309 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 310                          char *fastmap)
 311 {
 312   re_dfa_t *dfa = bufp->buffer;
 313   Idx node_cnt;
 314   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 315   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 316     {
 317       Idx node = init_state->nodes.elems[node_cnt];
 318       re_token_type_t type = dfa->nodes[node].type;
 319
 320       if (type == CHARACTER)
 321         {
 322           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 323 #ifdef RE_ENABLE_I18N
 324           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 325             {
 326               unsigned char buf[MB_LEN_MAX];
 327               unsigned char *p;
 328               wchar_t wc;
 329               mbstate_t state;
 330
 331               p = buf;
 332               *p++ = dfa->nodes[node].opr.c;
 333               while (++node < dfa->nodes_len
 334                      && dfa->nodes[node].type == CHARACTER
 335                      && dfa->nodes[node].mb_partial)
 336                 *p++ = dfa->nodes[node].opr.c;
 337               memset (&state, '\0', sizeof (state));
 338               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 339                              &state) == p - buf
 340                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 341                       != (size_t) -1))
 342                 re_set_fastmap (fastmap, false, buf[0]);
 343             }
 344 #endif
 345         }
 346       else if (type == SIMPLE_BRACKET)
 347         {
 348           int i, ch;
 349           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 350             {
 351               int j;
 352               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 353               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 354                 if (w & ((bitset_word_t) 1 << j))
 355                   re_set_fastmap (fastmap, icase, ch);
 356             }
 357         }
 358 #ifdef RE_ENABLE_I18N
 359       else if (type == COMPLEX_BRACKET)
 360         {
 361           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 362           Idx i;
 363
 364 # ifdef _LIBC
 365           /* See if we have to try all bytes which start multiple collation
 366              elements.
 367              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 368                   collation element, and don't catch 'b' since 'b' is
 369                   the only collation element which starts from 'b' (and
 370                   it is caught by SIMPLE_BRACKET).  */
 371               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 372                   && (cset->ncoll_syms || cset->nranges))
 373                 {
 374                   const int32_t *table = (const int32_t *)
 375                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 376                   for (i = 0; i < SBC_MAX; ++i)
 377                     if (table[i] < 0)
 378                       re_set_fastmap (fastmap, icase, i);
 379                 }
 380 # endif /* _LIBC */
 381
 382           /* See if we have to start the match at all multibyte characters,
 383              i.e. where we would not find an invalid sequence.  This only
 384              applies to multibyte character sets; for single byte character
 385              sets, the SIMPLE_BRACKET again suffices.  */
 386           if (dfa->mb_cur_max > 1
 387               && (cset->nchar_classes || cset->non_match || cset->nranges
 388 # ifdef _LIBC
 389                   || cset->nequiv_classes
 390 # endif /* _LIBC */
 391                  ))
 392             {
 393               unsigned char c = 0;
 394               do
 395                 {
 396                   mbstate_t mbs;
 397                   memset (&mbs, 0, sizeof (mbs));
 398                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 399                     re_set_fastmap (fastmap, false, (int) c);
 400                 }
 401               while (++c != 0);
 402             }
 403
 404           else
 405             {
 406               /* ... Else catch all bytes which can start the mbchars.  */
 407               for (i = 0; i < cset->nmbchars; ++i)
 408                 {
 409                   char buf[256];
 410                   mbstate_t state;
 411                   memset (&state, '\0', sizeof (state));
 412                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 413                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 414                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 415                     {
 416                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 417                           != (size_t) -1)
 418                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 419                     }
 420                 }
 421             }
 422         }
 423 #endif /* RE_ENABLE_I18N */
 424       else if (type == OP_PERIOD
 425 #ifdef RE_ENABLE_I18N
 426                || type == OP_UTF8_PERIOD
 427 #endif /* RE_ENABLE_I18N */
 428                || type == END_OF_RE)
 429         {
 430           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 431           if (type == END_OF_RE)
 432             bufp->can_be_null = 1;
 433           return;
 434         }
 435     }
 436 }
 437 \f
 438 /* Entry point for POSIX code.  */
 439 /* regcomp takes a regular expression as a string and compiles it.
 440
 441    PREG is a regex_t *.  We do not expect any fields to be initialized,
 442    since POSIX says we shouldn't.  Thus, we set
 443
 444      'buffer' to the compiled pattern;
 445      'used' to the length of the compiled pattern;
 446      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 447        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 448        RE_SYNTAX_POSIX_BASIC;
 449      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 450      'fastmap' to an allocated space for the fastmap;
 451      'fastmap_accurate' to zero;
 452      're_nsub' to the number of subexpressions in PATTERN.
 453
 454    PATTERN is the address of the pattern string.
 455
 456    CFLAGS is a series of bits which affect compilation.
 457
 458      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 459      use POSIX basic syntax.
 460
 461      If REG_NEWLINE is set, then . and [^...] don't match newline.
 462      Also, regexec will try a match beginning after every newline.
 463
 464      If REG_ICASE is set, then we considers upper- and lowercase
 465      versions of letters to be equivalent when matching.
 466
 467      If REG_NOSUB is set, then when PREG is passed to regexec, that
 468      routine will report only success or failure, and nothing about the
 469      registers.
 470
 471    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 472    the return codes and their meanings.)  */
 473
 474 int
 475 regcomp (preg, pattern, cflags)
 476     regex_t *_Restrict_ preg;
 477     const char *_Restrict_ pattern;
 478     int cflags;
 479 {
 480   reg_errcode_t ret;
 481   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 482                          : RE_SYNTAX_POSIX_BASIC);
 483
 484   preg->buffer = NULL;
 485   preg->allocated = 0;
 486   preg->used = 0;
 487
 488   /* Try to allocate space for the fastmap.  */
 489   preg->fastmap = re_malloc (char, SBC_MAX);
 490   if (BE (preg->fastmap == NULL, 0))
 491     return REG_ESPACE;
 492
 493   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 494
 495   /* If REG_NEWLINE is set, newlines are treated differently.  */
 496   if (cflags & REG_NEWLINE)
 497     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 498       syntax &= ~RE_DOT_NEWLINE;
 499       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 500       /* It also changes the matching behavior.  */
 501       preg->newline_anchor = 1;
 502     }
 503   else
 504     preg->newline_anchor = 0;
 505   preg->no_sub = !!(cflags & REG_NOSUB);
 506   preg->translate = NULL;
 507
 508   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 509
 510   /* POSIX doesn't distinguish between an unmatched open-group and an
 511      unmatched close-group: both are REG_EPAREN.  */
 512   if (ret == REG_ERPAREN)
 513     ret = REG_EPAREN;
 514
 515   /* We have already checked preg->fastmap != NULL.  */
 516   if (BE (ret == REG_NOERROR, 1))
 517     /* Compute the fastmap now, since regexec cannot modify the pattern
 518        buffer.  This function never fails in this implementation.  */
 519     (void) re_compile_fastmap (preg);
 520   else
 521     {
 522       /* Some error occurred while compiling the expression.  */
 523       re_free (preg->fastmap);
 524       preg->fastmap = NULL;
 525     }
 526
 527   return (int) ret;
 528 }
 529 #ifdef _LIBC
 530 weak_alias (__regcomp, regcomp)
 531 #endif
 532
 533 /* Returns a message corresponding to an error code, ERRCODE, returned
 534    from either regcomp or regexec.   We don't use PREG here.  */
 535
 536 #ifdef _LIBC
 537 size_t
 538 regerror (errcode, preg, errbuf, errbuf_size)
 539     int errcode;
 540     const regex_t *_Restrict_ preg;
 541     char *_Restrict_ errbuf;
 542     size_t errbuf_size;
 543 #else /* size_t might promote */
 544 size_t
 545 regerror (int errcode, const regex_t *_Restrict_ preg,
 546           char *_Restrict_ errbuf, size_t errbuf_size)
 547 #endif
 548 {
 549   const char *msg;
 550   size_t msg_size;
 551
 552   if (BE (errcode < 0
 553           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 554                                / sizeof (__re_error_msgid_idx[0])), 0))
 555     /* Only error codes returned by the rest of the code should be passed
 556        to this routine.  If we are given anything else, or if other regex
 557        code generates an invalid error code, then the program has a bug.
 558        Dump core so we can fix it.  */
 559     abort ();
 560
 561   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 562
 563   msg_size = strlen (msg) + 1; /* Includes the null.  */
 564
 565   if (BE (errbuf_size != 0, 1))
 566     {
 567       size_t cpy_size = msg_size;
 568       if (BE (msg_size > errbuf_size, 0))
 569         {
 570           cpy_size = errbuf_size - 1;
 571           errbuf[cpy_size] = '\0';
 572         }
 573       memcpy (errbuf, msg, cpy_size);
 574     }
 575
 576   return msg_size;
 577 }
 578 #ifdef _LIBC
 579 weak_alias (__regerror, regerror)
 580 #endif
 581
 582
 583 #ifdef RE_ENABLE_I18N
 584 /* This static array is used for the map to single-byte characters when
 585    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 586    it the same all the time.  UTF-8 is the preferred encoding so this is
 587    a worthwhile optimization.  */
 588 static const bitset_t utf8_sb_map =
 589 {
 590   /* Set the first 128 bits.  */
 591 # if defined __GNUC__ && !defined __STRICT_ANSI__
 592   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 593 # else
 594 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 595 #   error "bitset_word_t is narrower than 32 bits"
 596 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 598 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 599   BITSET_WORD_MAX, BITSET_WORD_MAX,
 600 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 601   BITSET_WORD_MAX,
 602 #  endif
 603   (BITSET_WORD_MAX
 604    >> (SBC_MAX % BITSET_WORD_BITS == 0
 605        ? 0
 606        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 607 # endif
 608 };
 609 #endif
 610
 611
 612 static void
 613 free_dfa_content (re_dfa_t *dfa)
 614 {
 615   Idx i, j;
 616
 617   if (dfa->nodes)
 618     for (i = 0; i < dfa->nodes_len; ++i)
 619       free_token (dfa->nodes + i);
 620   re_free (dfa->nexts);
 621   for (i = 0; i < dfa->nodes_len; ++i)
 622     {
 623       if (dfa->eclosures != NULL)
 624         re_node_set_free (dfa->eclosures + i);
 625       if (dfa->inveclosures != NULL)
 626         re_node_set_free (dfa->inveclosures + i);
 627       if (dfa->edests != NULL)
 628         re_node_set_free (dfa->edests + i);
 629     }
 630   re_free (dfa->edests);
 631   re_free (dfa->eclosures);
 632   re_free (dfa->inveclosures);
 633   re_free (dfa->nodes);
 634
 635   if (dfa->state_table)
 636     for (i = 0; i <= dfa->state_hash_mask; ++i)
 637       {
 638         struct re_state_table_entry *entry = dfa->state_table + i;
 639         for (j = 0; j < entry->num; ++j)
 640           {
 641             re_dfastate_t *state = entry->array[j];
 642             free_state (state);
 643           }
 644         re_free (entry->array);
 645       }
 646   re_free (dfa->state_table);
 647 #ifdef RE_ENABLE_I18N
 648   if (dfa->sb_char != utf8_sb_map)
 649     re_free (dfa->sb_char);
 650 #endif
 651   re_free (dfa->subexp_map);
 652 #ifdef DEBUG
 653   re_free (dfa->re_str);
 654 #endif
 655
 656   re_free (dfa);
 657 }
 658
 659
 660 /* Free dynamically allocated space used by PREG.  */
 661
 662 void
 663 regfree (preg)
 664     regex_t *preg;
 665 {
 666   re_dfa_t *dfa = preg->buffer;
 667   if (BE (dfa != NULL, 1))
 668     {
 669       lock_fini (dfa->lock);
 670       free_dfa_content (dfa);
 671     }
 672   preg->buffer = NULL;
 673   preg->allocated = 0;
 674
 675   re_free (preg->fastmap);
 676   preg->fastmap = NULL;
 677
 678   re_free (preg->translate);
 679   preg->translate = NULL;
 680 }
 681 #ifdef _LIBC
 682 weak_alias (__regfree, regfree)
 683 #endif
 684 \f
 685 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 686    them unless specifically requested.  */
 687
 688 #if defined _REGEX_RE_COMP || defined _LIBC
 689
 690 /* BSD has one and only one pattern buffer.  */
 691 static struct re_pattern_buffer re_comp_buf;
 692
 693 char *
 694 # ifdef _LIBC
 695 /* Make these definitions weak in libc, so POSIX programs can redefine
 696    these names if they don't use our functions, and still use
 697    regcomp/regexec above without link errors.  */
 698 weak_function
 699 # endif
 700 re_comp (s)
 701      const char *s;
 702 {
 703   reg_errcode_t ret;
 704   char *fastmap;
 705
 706   if (!s)
 707     {
 708       if (!re_comp_buf.buffer)
 709         return gettext ("No previous regular expression");
 710       return 0;
 711     }
 712
 713   if (re_comp_buf.buffer)
 714     {
 715       fastmap = re_comp_buf.fastmap;
 716       re_comp_buf.fastmap = NULL;
 717       __regfree (&re_comp_buf);
 718       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 719       re_comp_buf.fastmap = fastmap;
 720     }
 721
 722   if (re_comp_buf.fastmap == NULL)
 723     {
 724       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 725       if (re_comp_buf.fastmap == NULL)
 726         return (char *) gettext (__re_error_msgid
 727                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 728     }
 729
 730   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 731      don't need to initialize the pattern buffer fields which affect it.  */
 732
 733   /* Match anchors at newlines.  */
 734   re_comp_buf.newline_anchor = 1;
 735
 736   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 737
 738   if (!ret)
 739     return NULL;
 740
 741   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 742   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 743 }
 744
 745 #ifdef _LIBC
 746 libc_freeres_fn (free_mem)
 747 {
 748   __regfree (&re_comp_buf);
 749 }
 750 #endif
 751
 752 #endif /* _REGEX_RE_COMP */
 753 \f
 754 /* Internal entry point.
 755    Compile the regular expression PATTERN, whose length is LENGTH.
 756    SYNTAX indicate regular expression's syntax.  */
 757
 758 static reg_errcode_t
 759 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 760                      reg_syntax_t syntax)
 761 {
 762   reg_errcode_t err = REG_NOERROR;
 763   re_dfa_t *dfa;
 764   re_string_t regexp;
 765
 766   /* Initialize the pattern buffer.  */
 767   preg->fastmap_accurate = 0;
 768   preg->syntax = syntax;
 769   preg->not_bol = preg->not_eol = 0;
 770   preg->used = 0;
 771   preg->re_nsub = 0;
 772   preg->can_be_null = 0;
 773   preg->regs_allocated = REGS_UNALLOCATED;
 774
 775   /* Initialize the dfa.  */
 776   dfa = preg->buffer;
 777   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 778     {
 779       /* If zero allocated, but buffer is non-null, try to realloc
 780          enough space.  This loses if buffer's address is bogus, but
 781          that is the user's responsibility.  If ->buffer is NULL this
 782          is a simple allocation.  */
 783       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 784       if (dfa == NULL)
 785         return REG_ESPACE;
 786       preg->allocated = sizeof (re_dfa_t);
 787       preg->buffer = dfa;
 788     }
 789   preg->used = sizeof (re_dfa_t);
 790
 791   err = init_dfa (dfa, length);
 792   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
 793     err = REG_ESPACE;
 794   if (BE (err != REG_NOERROR, 0))
 795     {
 796       free_dfa_content (dfa);
 797       preg->buffer = NULL;
 798       preg->allocated = 0;
 799       return err;
 800     }
 801 #ifdef DEBUG
 802   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 803   dfa->re_str = re_malloc (char, length + 1);
 804   strncpy (dfa->re_str, pattern, length + 1);
 805 #endif
 806
 807   err = re_string_construct (&regexp, pattern, length, preg->translate,
 808                              (syntax & RE_ICASE) != 0, dfa);
 809   if (BE (err != REG_NOERROR, 0))
 810     {
 811     re_compile_internal_free_return:
 812       free_workarea_compile (preg);
 813       re_string_destruct (&regexp);
 814       lock_fini (dfa->lock);
 815       free_dfa_content (dfa);
 816       preg->buffer = NULL;
 817       preg->allocated = 0;
 818       return err;
 819     }
 820
 821   /* Parse the regular expression, and build a structure tree.  */
 822   preg->re_nsub = 0;
 823   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 824   if (BE (dfa->str_tree == NULL, 0))
 825     goto re_compile_internal_free_return;
 826
 827   /* Analyze the tree and create the nfa.  */
 828   err = analyze (preg);
 829   if (BE (err != REG_NOERROR, 0))
 830     goto re_compile_internal_free_return;
 831
 832 #ifdef RE_ENABLE_I18N
 833   /* If possible, do searching in single byte encoding to speed things up.  */
 834   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 835     optimize_utf8 (dfa);
 836 #endif
 837
 838   /* Then create the initial state of the dfa.  */
 839   err = create_initial_state (dfa);
 840
 841   /* Release work areas.  */
 842   free_workarea_compile (preg);
 843   re_string_destruct (&regexp);
 844
 845   if (BE (err != REG_NOERROR, 0))
 846     {
 847       lock_fini (dfa->lock);
 848       free_dfa_content (dfa);
 849       preg->buffer = NULL;
 850       preg->allocated = 0;
 851     }
 852
 853   return err;
 854 }
 855
 856 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 857    as the initial length of some arrays.  */
 858
 859 static reg_errcode_t
 860 init_dfa (re_dfa_t *dfa, size_t pat_len)
 861 {
 862   __re_size_t table_size;
 863 #ifndef _LIBC
 864   const char *codeset_name;
 865 #endif
 866 #ifdef RE_ENABLE_I18N
 867   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 868 #else
 869   size_t max_i18n_object_size = 0;
 870 #endif
 871   size_t max_object_size =
 872     MAX (sizeof (struct re_state_table_entry),
 873          MAX (sizeof (re_token_t),
 874               MAX (sizeof (re_node_set),
 875                    MAX (sizeof (regmatch_t),
 876                         max_i18n_object_size))));
 877
 878   memset (dfa, '\0', sizeof (re_dfa_t));
 879
 880   /* Force allocation of str_tree_storage the first time.  */
 881   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 882
 883   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 884      calculation below, and for similar doubling calculations
 885      elsewhere.  And it's <= rather than <, because some of the
 886      doubling calculations add 1 afterwards.  */
 887   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 888     return REG_ESPACE;
 889
 890   dfa->nodes_alloc = pat_len + 1;
 891   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 892
 893   /*  table_size = 2 ^ ceil(log pat_len) */
 894   for (table_size = 1; ; table_size <<= 1)
 895     if (table_size > pat_len)
 896       break;
 897
 898   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 899   dfa->state_hash_mask = table_size - 1;
 900
 901   dfa->mb_cur_max = MB_CUR_MAX;
 902 #ifdef _LIBC
 903   if (dfa->mb_cur_max == 6
 904       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 905     dfa->is_utf8 = 1;
 906   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 907                        != 0);
 908 #else
 909   codeset_name = nl_langinfo (CODESET);
 910   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 911       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 912       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 913       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 914     dfa->is_utf8 = 1;
 915
 916   /* We check exhaustively in the loop below if this charset is a
 917      superset of ASCII.  */
 918   dfa->map_notascii = 0;
 919 #endif
 920
 921 #ifdef RE_ENABLE_I18N
 922   if (dfa->mb_cur_max > 1)
 923     {
 924       if (dfa->is_utf8)
 925         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 926       else
 927         {
 928           int i, j, ch;
 929
 930           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 931           if (BE (dfa->sb_char == NULL, 0))
 932             return REG_ESPACE;
 933
 934           /* Set the bits corresponding to single byte chars.  */
 935           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 936             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 937               {
 938                 wint_t wch = __btowc (ch);
 939                 if (wch != WEOF)
 940                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 941 # ifndef _LIBC
 942                 if (isascii (ch) && wch != ch)
 943                   dfa->map_notascii = 1;
 944 # endif
 945               }
 946         }
 947     }
 948 #endif
 949
 950   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 951     return REG_ESPACE;
 952   return REG_NOERROR;
 953 }
 954
 955 /* Initialize WORD_CHAR table, which indicate which character is
 956    "word".  In this case "word" means that it is the word construction
 957    character used by some operators like "\<", "\>", etc.  */
 958
 959 static void
 960 internal_function
 961 init_word_char (re_dfa_t *dfa)
 962 {
 963   int i = 0;
 964   int j;
 965   int ch = 0;
 966   dfa->word_ops_used = 1;
 967   if (BE (dfa->map_notascii == 0, 1))
 968     {
 969       bitset_word_t bits0 = 0x00000000;
 970       bitset_word_t bits1 = 0x03ff0000;
 971       bitset_word_t bits2 = 0x87fffffe;
 972       bitset_word_t bits3 = 0x07fffffe;
 973       if (BITSET_WORD_BITS == 64)
 974         {
 975           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 976           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 977           i = 2;
 978         }
 979       else if (BITSET_WORD_BITS == 32)
 980         {
 981           dfa->word_char[0] = bits0;
 982           dfa->word_char[1] = bits1;
 983           dfa->word_char[2] = bits2;
 984           dfa->word_char[3] = bits3;
 985           i = 4;
 986         }
 987       else
 988         goto general_case;
 989       ch = 128;
 990
 991       if (BE (dfa->is_utf8, 1))
 992         {
 993           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 994           return;
 995         }
 996     }
 997
 998  general_case:
 999   for (; i < BITSET_WORDS; ++i)
1000     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
1001       if (isalnum (ch) || ch == '_')
1002         dfa->word_char[i] |= (bitset_word_t) 1 << j;
1003 }
1004
1005 /* Free the work area which are only used while compiling.  */
1006
1007 static void
1008 free_workarea_compile (regex_t *preg)
1009 {
1010   re_dfa_t *dfa = preg->buffer;
1011   bin_tree_storage_t *storage, *next;
1012   for (storage = dfa->str_tree_storage; storage; storage = next)
1013     {
1014       next = storage->next;
1015       re_free (storage);
1016     }
1017   dfa->str_tree_storage = NULL;
1018   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1019   dfa->str_tree = NULL;
1020   re_free (dfa->org_indices);
1021   dfa->org_indices = NULL;
1022 }
1023
1024 /* Create initial states for all contexts.  */
1025
1026 static reg_errcode_t
1027 create_initial_state (re_dfa_t *dfa)
1028 {
1029   Idx first, i;
1030   reg_errcode_t err;
1031   re_node_set init_nodes;
1032
1033   /* Initial states have the epsilon closure of the node which is
1034      the first node of the regular expression.  */
1035   first = dfa->str_tree->first->node_idx;
1036   dfa->init_node = first;
1037   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1038   if (BE (err != REG_NOERROR, 0))
1039     return err;
1040
1041   /* The back-references which are in initial states can epsilon transit,
1042      since in this case all of the subexpressions can be null.
1043      Then we add epsilon closures of the nodes which are the next nodes of
1044      the back-references.  */
1045   if (dfa->nbackref > 0)
1046     for (i = 0; i < init_nodes.nelem; ++i)
1047       {
1048         Idx node_idx = init_nodes.elems[i];
1049         re_token_type_t type = dfa->nodes[node_idx].type;
1050
1051         Idx clexp_idx;
1052         if (type != OP_BACK_REF)
1053           continue;
1054         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1055           {
1056             re_token_t *clexp_node;
1057             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1058             if (clexp_node->type == OP_CLOSE_SUBEXP
1059                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1060               break;
1061           }
1062         if (clexp_idx == init_nodes.nelem)
1063           continue;
1064
1065         if (type == OP_BACK_REF)
1066           {
1067             Idx dest_idx = dfa->edests[node_idx].elems[0];
1068             if (!re_node_set_contains (&init_nodes, dest_idx))
1069               {
1070                 reg_errcode_t merge_err
1071                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1072                 if (merge_err != REG_NOERROR)
1073                   return merge_err;
1074                 i = 0;
1075               }
1076           }
1077       }
1078
1079   /* It must be the first time to invoke acquire_state.  */
1080   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1081   /* We don't check ERR here, since the initial state must not be NULL.  */
1082   if (BE (dfa->init_state == NULL, 0))
1083     return err;
1084   if (dfa->init_state->has_constraint)
1085     {
1086       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1087                                                        CONTEXT_WORD);
1088       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1089                                                      CONTEXT_NEWLINE);
1090       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1091                                                          &init_nodes,
1092                                                          CONTEXT_NEWLINE
1093                                                          | CONTEXT_BEGBUF);
1094       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1095               || dfa->init_state_begbuf == NULL, 0))
1096         return err;
1097     }
1098   else
1099     dfa->init_state_word = dfa->init_state_nl
1100       = dfa->init_state_begbuf = dfa->init_state;
1101
1102   re_node_set_free (&init_nodes);
1103   return REG_NOERROR;
1104 }
1105 \f
1106 #ifdef RE_ENABLE_I18N
1107 /* If it is possible to do searching in single byte encoding instead of UTF-8
1108    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1109    DFA nodes where needed.  */
1110
1111 static void
1112 optimize_utf8 (re_dfa_t *dfa)
1113 {
1114   Idx node;
1115   int i;
1116   bool mb_chars = false;
1117   bool has_period = false;
1118
1119   for (node = 0; node < dfa->nodes_len; ++node)
1120     switch (dfa->nodes[node].type)
1121       {
1122       case CHARACTER:
1123         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1124           mb_chars = true;
1125         break;
1126       case ANCHOR:
1127         switch (dfa->nodes[node].opr.ctx_type)
1128           {
1129           case LINE_FIRST:
1130           case LINE_LAST:
1131           case BUF_FIRST:
1132           case BUF_LAST:
1133             break;
1134           default:
1135             /* Word anchors etc. cannot be handled.  It's okay to test
1136                opr.ctx_type since constraints (for all DFA nodes) are
1137                created by ORing one or more opr.ctx_type values.  */
1138             return;
1139           }
1140         break;
1141       case OP_PERIOD:
1142         has_period = true;
1143         break;
1144       case OP_BACK_REF:
1145       case OP_ALT:
1146       case END_OF_RE:
1147       case OP_DUP_ASTERISK:
1148       case OP_OPEN_SUBEXP:
1149       case OP_CLOSE_SUBEXP:
1150         break;
1151       case COMPLEX_BRACKET:
1152         return;
1153       case SIMPLE_BRACKET:
1154         /* Just double check.  */
1155         {
1156           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1157                         ? 0
1158                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1159           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1160             {
1161               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1162                 return;
1163               rshift = 0;
1164             }
1165         }
1166         break;
1167       default:
1168         abort ();
1169       }
1170
1171   if (mb_chars || has_period)
1172     for (node = 0; node < dfa->nodes_len; ++node)
1173       {
1174         if (dfa->nodes[node].type == CHARACTER
1175             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1176           dfa->nodes[node].mb_partial = 0;
1177         else if (dfa->nodes[node].type == OP_PERIOD)
1178           dfa->nodes[node].type = OP_UTF8_PERIOD;
1179       }
1180
1181   /* The search can be in single byte locale.  */
1182   dfa->mb_cur_max = 1;
1183   dfa->is_utf8 = 0;
1184   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1185 }
1186 #endif
1187 \f
1188 /* Analyze the structure tree, and calculate "first", "next", "edest",
1189    "eclosure", and "inveclosure".  */
1190
1191 static reg_errcode_t
1192 analyze (regex_t *preg)
1193 {
1194   re_dfa_t *dfa = preg->buffer;
1195   reg_errcode_t ret;
1196
1197   /* Allocate arrays.  */
1198   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1199   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1200   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1201   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1202   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1203           || dfa->eclosures == NULL, 0))
1204     return REG_ESPACE;
1205
1206   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1207   if (dfa->subexp_map != NULL)
1208     {
1209       Idx i;
1210       for (i = 0; i < preg->re_nsub; i++)
1211         dfa->subexp_map[i] = i;
1212       preorder (dfa->str_tree, optimize_subexps, dfa);
1213       for (i = 0; i < preg->re_nsub; i++)
1214         if (dfa->subexp_map[i] != i)
1215           break;
1216       if (i == preg->re_nsub)
1217         {
1218           free (dfa->subexp_map);
1219           dfa->subexp_map = NULL;
1220         }
1221     }
1222
1223   ret = postorder (dfa->str_tree, lower_subexps, preg);
1224   if (BE (ret != REG_NOERROR, 0))
1225     return ret;
1226   ret = postorder (dfa->str_tree, calc_first, dfa);
1227   if (BE (ret != REG_NOERROR, 0))
1228     return ret;
1229   preorder (dfa->str_tree, calc_next, dfa);
1230   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1231   if (BE (ret != REG_NOERROR, 0))
1232     return ret;
1233   ret = calc_eclosure (dfa);
1234   if (BE (ret != REG_NOERROR, 0))
1235     return ret;
1236
1237   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1238      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1239   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1240       || dfa->nbackref)
1241     {
1242       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1243       if (BE (dfa->inveclosures == NULL, 0))
1244         return REG_ESPACE;
1245       ret = calc_inveclosure (dfa);
1246     }
1247
1248   return ret;
1249 }
1250
1251 /* Our parse trees are very unbalanced, so we cannot use a stack to
1252    implement parse tree visits.  Instead, we use parent pointers and
1253    some hairy code in these two functions.  */
1254 static reg_errcode_t
1255 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1256            void *extra)
1257 {
1258   bin_tree_t *node, *prev;
1259
1260   for (node = root; ; )
1261     {
1262       /* Descend down the tree, preferably to the left (or to the right
1263          if that's the only child).  */
1264       while (node->left || node->right)
1265         if (node->left)
1266           node = node->left;
1267         else
1268           node = node->right;
1269
1270       do
1271         {
1272           reg_errcode_t err = fn (extra, node);
1273           if (BE (err != REG_NOERROR, 0))
1274             return err;
1275           if (node->parent == NULL)
1276             return REG_NOERROR;
1277           prev = node;
1278           node = node->parent;
1279         }
1280       /* Go up while we have a node that is reached from the right.  */
1281       while (node->right == prev || node->right == NULL);
1282       node = node->right;
1283     }
1284 }
1285
1286 static reg_errcode_t
1287 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1288           void *extra)
1289 {
1290   bin_tree_t *node;
1291
1292   for (node = root; ; )
1293     {
1294       reg_errcode_t err = fn (extra, node);
1295       if (BE (err != REG_NOERROR, 0))
1296         return err;
1297
1298       /* Go to the left node, or up and to the right.  */
1299       if (node->left)
1300         node = node->left;
1301       else
1302         {
1303           bin_tree_t *prev = NULL;
1304           while (node->right == prev || node->right == NULL)
1305             {
1306               prev = node;
1307               node = node->parent;
1308               if (!node)
1309                 return REG_NOERROR;
1310             }
1311           node = node->right;
1312         }
1313     }
1314 }
1315
1316 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1317    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1318    backreferences as well.  Requires a preorder visit.  */
1319 static reg_errcode_t
1320 optimize_subexps (void *extra, bin_tree_t *node)
1321 {
1322   re_dfa_t *dfa = (re_dfa_t *) extra;
1323
1324   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1325     {
1326       int idx = node->token.opr.idx;
1327       node->token.opr.idx = dfa->subexp_map[idx];
1328       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1329     }
1330
1331   else if (node->token.type == SUBEXP
1332            && node->left && node->left->token.type == SUBEXP)
1333     {
1334       Idx other_idx = node->left->token.opr.idx;
1335
1336       node->left = node->left->left;
1337       if (node->left)
1338         node->left->parent = node;
1339
1340       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1341       if (other_idx < BITSET_WORD_BITS)
1342         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1343     }
1344
1345   return REG_NOERROR;
1346 }
1347
1348 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1349    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1350 static reg_errcode_t
1351 lower_subexps (void *extra, bin_tree_t *node)
1352 {
1353   regex_t *preg = (regex_t *) extra;
1354   reg_errcode_t err = REG_NOERROR;
1355
1356   if (node->left && node->left->token.type == SUBEXP)
1357     {
1358       node->left = lower_subexp (&err, preg, node->left);
1359       if (node->left)
1360         node->left->parent = node;
1361     }
1362   if (node->right && node->right->token.type == SUBEXP)
1363     {
1364       node->right = lower_subexp (&err, preg, node->right);
1365       if (node->right)
1366         node->right->parent = node;
1367     }
1368
1369   return err;
1370 }
1371
1372 static bin_tree_t *
1373 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1374 {
1375   re_dfa_t *dfa = preg->buffer;
1376   bin_tree_t *body = node->left;
1377   bin_tree_t *op, *cls, *tree1, *tree;
1378
1379   if (preg->no_sub
1380       /* We do not optimize empty subexpressions, because otherwise we may
1381          have bad CONCAT nodes with NULL children.  This is obviously not
1382          very common, so we do not lose much.  An example that triggers
1383          this case is the sed "script" /\(\)/x.  */
1384       && node->left != NULL
1385       && (node->token.opr.idx >= BITSET_WORD_BITS
1386           || !(dfa->used_bkref_map
1387                & ((bitset_word_t) 1 << node->token.opr.idx))))
1388     return node->left;
1389
1390   /* Convert the SUBEXP node to the concatenation of an
1391      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1392   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1393   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1394   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1395   tree = create_tree (dfa, op, tree1, CONCAT);
1396   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1397     {
1398       *err = REG_ESPACE;
1399       return NULL;
1400     }
1401
1402   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1403   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1404   return tree;
1405 }
1406
1407 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1408    nodes.  Requires a postorder visit.  */
1409 static reg_errcode_t
1410 calc_first (void *extra, bin_tree_t *node)
1411 {
1412   re_dfa_t *dfa = (re_dfa_t *) extra;
1413   if (node->token.type == CONCAT)
1414     {
1415       node->first = node->left->first;
1416       node->node_idx = node->left->node_idx;
1417     }
1418   else
1419     {
1420       node->first = node;
1421       node->node_idx = re_dfa_add_node (dfa, node->token);
1422       if (BE (node->node_idx == REG_MISSING, 0))
1423         return REG_ESPACE;
1424       if (node->token.type == ANCHOR)
1425         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1426     }
1427   return REG_NOERROR;
1428 }
1429
1430 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1431 static reg_errcode_t
1432 calc_next (void *extra, bin_tree_t *node)
1433 {
1434   switch (node->token.type)
1435     {
1436     case OP_DUP_ASTERISK:
1437       node->left->next = node;
1438       break;
1439     case CONCAT:
1440       node->left->next = node->right->first;
1441       node->right->next = node->next;
1442       break;
1443     default:
1444       if (node->left)
1445         node->left->next = node->next;
1446       if (node->right)
1447         node->right->next = node->next;
1448       break;
1449     }
1450   return REG_NOERROR;
1451 }
1452
1453 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1454 static reg_errcode_t
1455 link_nfa_nodes (void *extra, bin_tree_t *node)
1456 {
1457   re_dfa_t *dfa = (re_dfa_t *) extra;
1458   Idx idx = node->node_idx;
1459   reg_errcode_t err = REG_NOERROR;
1460
1461   switch (node->token.type)
1462     {
1463     case CONCAT:
1464       break;
1465
1466     case END_OF_RE:
1467       assert (node->next == NULL);
1468       break;
1469
1470     case OP_DUP_ASTERISK:
1471     case OP_ALT:
1472       {
1473         Idx left, right;
1474         dfa->has_plural_match = 1;
1475         if (node->left != NULL)
1476           left = node->left->first->node_idx;
1477         else
1478           left = node->next->node_idx;
1479         if (node->right != NULL)
1480           right = node->right->first->node_idx;
1481         else
1482           right = node->next->node_idx;
1483         assert (REG_VALID_INDEX (left));
1484         assert (REG_VALID_INDEX (right));
1485         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1486       }
1487       break;
1488
1489     case ANCHOR:
1490     case OP_OPEN_SUBEXP:
1491     case OP_CLOSE_SUBEXP:
1492       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1493       break;
1494
1495     case OP_BACK_REF:
1496       dfa->nexts[idx] = node->next->node_idx;
1497       if (node->token.type == OP_BACK_REF)
1498         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1499       break;
1500
1501     default:
1502       assert (!IS_EPSILON_NODE (node->token.type));
1503       dfa->nexts[idx] = node->next->node_idx;
1504       break;
1505     }
1506
1507   return err;
1508 }
1509
1510 /* Duplicate the epsilon closure of the node ROOT_NODE.
1511    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1512    to their own constraint.  */
1513
1514 static reg_errcode_t
1515 internal_function
1516 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1517                         Idx root_node, unsigned int init_constraint)
1518 {
1519   Idx org_node, clone_node;
1520   bool ok;
1521   unsigned int constraint = init_constraint;
1522   for (org_node = top_org_node, clone_node = top_clone_node;;)
1523     {
1524       Idx org_dest, clone_dest;
1525       if (dfa->nodes[org_node].type == OP_BACK_REF)
1526         {
1527           /* If the back reference epsilon-transit, its destination must
1528              also have the constraint.  Then duplicate the epsilon closure
1529              of the destination of the back reference, and store it in
1530              edests of the back reference.  */
1531           org_dest = dfa->nexts[org_node];
1532           re_node_set_empty (dfa->edests + clone_node);
1533           clone_dest = duplicate_node (dfa, org_dest, constraint);
1534           if (BE (clone_dest == REG_MISSING, 0))
1535             return REG_ESPACE;
1536           dfa->nexts[clone_node] = dfa->nexts[org_node];
1537           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1538           if (BE (! ok, 0))
1539             return REG_ESPACE;
1540         }
1541       else if (dfa->edests[org_node].nelem == 0)
1542         {
1543           /* In case of the node can't epsilon-transit, don't duplicate the
1544              destination and store the original destination as the
1545              destination of the node.  */
1546           dfa->nexts[clone_node] = dfa->nexts[org_node];
1547           break;
1548         }
1549       else if (dfa->edests[org_node].nelem == 1)
1550         {
1551           /* In case of the node can epsilon-transit, and it has only one
1552              destination.  */
1553           org_dest = dfa->edests[org_node].elems[0];
1554           re_node_set_empty (dfa->edests + clone_node);
1555           /* If the node is root_node itself, it means the epsilon closure
1556              has a loop.  Then tie it to the destination of the root_node.  */
1557           if (org_node == root_node && clone_node != org_node)
1558             {
1559               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1560               if (BE (! ok, 0))
1561                 return REG_ESPACE;
1562               break;
1563             }
1564           /* In case the node has another constraint, append it.  */
1565           constraint |= dfa->nodes[org_node].constraint;
1566           clone_dest = duplicate_node (dfa, org_dest, constraint);
1567           if (BE (clone_dest == REG_MISSING, 0))
1568             return REG_ESPACE;
1569           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1570           if (BE (! ok, 0))
1571             return REG_ESPACE;
1572         }
1573       else /* dfa->edests[org_node].nelem == 2 */
1574         {
1575           /* In case of the node can epsilon-transit, and it has two
1576              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1577           org_dest = dfa->edests[org_node].elems[0];
1578           re_node_set_empty (dfa->edests + clone_node);
1579           /* Search for a duplicated node which satisfies the constraint.  */
1580           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1581           if (clone_dest == REG_MISSING)
1582             {
1583               /* There is no such duplicated node, create a new one.  */
1584               reg_errcode_t err;
1585               clone_dest = duplicate_node (dfa, org_dest, constraint);
1586               if (BE (clone_dest == REG_MISSING, 0))
1587                 return REG_ESPACE;
1588               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1589               if (BE (! ok, 0))
1590                 return REG_ESPACE;
1591               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1592                                             root_node, constraint);
1593               if (BE (err != REG_NOERROR, 0))
1594                 return err;
1595             }
1596           else
1597             {
1598               /* There is a duplicated node which satisfies the constraint,
1599                  use it to avoid infinite loop.  */
1600               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1601               if (BE (! ok, 0))
1602                 return REG_ESPACE;
1603             }
1604
1605           org_dest = dfa->edests[org_node].elems[1];
1606           clone_dest = duplicate_node (dfa, org_dest, constraint);
1607           if (BE (clone_dest == REG_MISSING, 0))
1608             return REG_ESPACE;
1609           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1610           if (BE (! ok, 0))
1611             return REG_ESPACE;
1612         }
1613       org_node = org_dest;
1614       clone_node = clone_dest;
1615     }
1616   return REG_NOERROR;
1617 }
1618
1619 /* Search for a node which is duplicated from the node ORG_NODE, and
1620    satisfies the constraint CONSTRAINT.  */
1621
1622 static Idx
1623 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1624                         unsigned int constraint)
1625 {
1626   Idx idx;
1627   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1628     {
1629       if (org_node == dfa->org_indices[idx]
1630           && constraint == dfa->nodes[idx].constraint)
1631         return idx; /* Found.  */
1632     }
1633   return REG_MISSING; /* Not found.  */
1634 }
1635
1636 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1637    Return the index of the new node, or REG_MISSING if insufficient storage is
1638    available.  */
1639
1640 static Idx
1641 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1642 {
1643   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1644   if (BE (dup_idx != REG_MISSING, 1))
1645     {
1646       dfa->nodes[dup_idx].constraint = constraint;
1647       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1648       dfa->nodes[dup_idx].duplicated = 1;
1649
1650       /* Store the index of the original node.  */
1651       dfa->org_indices[dup_idx] = org_idx;
1652     }
1653   return dup_idx;
1654 }
1655
1656 static reg_errcode_t
1657 calc_inveclosure (re_dfa_t *dfa)
1658 {
1659   Idx src, idx;
1660   bool ok;
1661   for (idx = 0; idx < dfa->nodes_len; ++idx)
1662     re_node_set_init_empty (dfa->inveclosures + idx);
1663
1664   for (src = 0; src < dfa->nodes_len; ++src)
1665     {
1666       Idx *elems = dfa->eclosures[src].elems;
1667       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1668         {
1669           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1670           if (BE (! ok, 0))
1671             return REG_ESPACE;
1672         }
1673     }
1674
1675   return REG_NOERROR;
1676 }
1677
1678 /* Calculate "eclosure" for all the node in DFA.  */
1679
1680 static reg_errcode_t
1681 calc_eclosure (re_dfa_t *dfa)
1682 {
1683   Idx node_idx;
1684   bool incomplete;
1685 #ifdef DEBUG
1686   assert (dfa->nodes_len > 0);
1687 #endif
1688   incomplete = false;
1689   /* For each nodes, calculate epsilon closure.  */
1690   for (node_idx = 0; ; ++node_idx)
1691     {
1692       reg_errcode_t err;
1693       re_node_set eclosure_elem;
1694       if (node_idx == dfa->nodes_len)
1695         {
1696           if (!incomplete)
1697             break;
1698           incomplete = false;
1699           node_idx = 0;
1700         }
1701
1702 #ifdef DEBUG
1703       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1704 #endif
1705
1706       /* If we have already calculated, skip it.  */
1707       if (dfa->eclosures[node_idx].nelem != 0)
1708         continue;
1709       /* Calculate epsilon closure of 'node_idx'.  */
1710       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1711       if (BE (err != REG_NOERROR, 0))
1712         return err;
1713
1714       if (dfa->eclosures[node_idx].nelem == 0)
1715         {
1716           incomplete = true;
1717           re_node_set_free (&eclosure_elem);
1718         }
1719     }
1720   return REG_NOERROR;
1721 }
1722
1723 /* Calculate epsilon closure of NODE.  */
1724
1725 static reg_errcode_t
1726 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1727 {
1728   reg_errcode_t err;
1729   Idx i;
1730   re_node_set eclosure;
1731   bool ok;
1732   bool incomplete = false;
1733   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1734   if (BE (err != REG_NOERROR, 0))
1735     return err;
1736
1737   /* This indicates that we are calculating this node now.
1738      We reference this value to avoid infinite loop.  */
1739   dfa->eclosures[node].nelem = REG_MISSING;
1740
1741   /* If the current node has constraints, duplicate all nodes
1742      since they must inherit the constraints.  */
1743   if (dfa->nodes[node].constraint
1744       && dfa->edests[node].nelem
1745       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1746     {
1747       err = duplicate_node_closure (dfa, node, node, node,
1748                                     dfa->nodes[node].constraint);
1749       if (BE (err != REG_NOERROR, 0))
1750         return err;
1751     }
1752
1753   /* Expand each epsilon destination nodes.  */
1754   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1755     for (i = 0; i < dfa->edests[node].nelem; ++i)
1756       {
1757         re_node_set eclosure_elem;
1758         Idx edest = dfa->edests[node].elems[i];
1759         /* If calculating the epsilon closure of 'edest' is in progress,
1760            return intermediate result.  */
1761         if (dfa->eclosures[edest].nelem == REG_MISSING)
1762           {
1763             incomplete = true;
1764             continue;
1765           }
1766         /* If we haven't calculated the epsilon closure of 'edest' yet,
1767            calculate now. Otherwise use calculated epsilon closure.  */
1768         if (dfa->eclosures[edest].nelem == 0)
1769           {
1770             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1771             if (BE (err != REG_NOERROR, 0))
1772               return err;
1773           }
1774         else
1775           eclosure_elem = dfa->eclosures[edest];
1776         /* Merge the epsilon closure of 'edest'.  */
1777         err = re_node_set_merge (&eclosure, &eclosure_elem);
1778         if (BE (err != REG_NOERROR, 0))
1779           return err;
1780         /* If the epsilon closure of 'edest' is incomplete,
1781            the epsilon closure of this node is also incomplete.  */
1782         if (dfa->eclosures[edest].nelem == 0)
1783           {
1784             incomplete = true;
1785             re_node_set_free (&eclosure_elem);
1786           }
1787       }
1788
1789   /* An epsilon closure includes itself.  */
1790   ok = re_node_set_insert (&eclosure, node);
1791   if (BE (! ok, 0))
1792     return REG_ESPACE;
1793   if (incomplete && !root)
1794     dfa->eclosures[node].nelem = 0;
1795   else
1796     dfa->eclosures[node] = eclosure;
1797   *new_set = eclosure;
1798   return REG_NOERROR;
1799 }
1800 \f
1801 /* Functions for token which are used in the parser.  */
1802
1803 /* Fetch a token from INPUT.
1804    We must not use this function inside bracket expressions.  */
1805
1806 static void
1807 internal_function
1808 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1809 {
1810   re_string_skip_bytes (input, peek_token (result, input, syntax));
1811 }
1812
1813 /* Peek a token from INPUT, and return the length of the token.
1814    We must not use this function inside bracket expressions.  */
1815
1816 static int
1817 internal_function
1818 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1819 {
1820   unsigned char c;
1821
1822   if (re_string_eoi (input))
1823     {
1824       token->type = END_OF_RE;
1825       return 0;
1826     }
1827
1828   c = re_string_peek_byte (input, 0);
1829   token->opr.c = c;
1830
1831   token->word_char = 0;
1832 #ifdef RE_ENABLE_I18N
1833   token->mb_partial = 0;
1834   if (input->mb_cur_max > 1 &&
1835       !re_string_first_byte (input, re_string_cur_idx (input)))
1836     {
1837       token->type = CHARACTER;
1838       token->mb_partial = 1;
1839       return 1;
1840     }
1841 #endif
1842   if (c == '\\')
1843     {
1844       unsigned char c2;
1845       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1846         {
1847           token->type = BACK_SLASH;
1848           return 1;
1849         }
1850
1851       c2 = re_string_peek_byte_case (input, 1);
1852       token->opr.c = c2;
1853       token->type = CHARACTER;
1854 #ifdef RE_ENABLE_I18N
1855       if (input->mb_cur_max > 1)
1856         {
1857           wint_t wc = re_string_wchar_at (input,
1858                                           re_string_cur_idx (input) + 1);
1859           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1860         }
1861       else
1862 #endif
1863         token->word_char = IS_WORD_CHAR (c2) != 0;
1864
1865       switch (c2)
1866         {
1867         case '|':
1868           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1869             token->type = OP_ALT;
1870           break;
1871         case '1': case '2': case '3': case '4': case '5':
1872         case '6': case '7': case '8': case '9':
1873           if (!(syntax & RE_NO_BK_REFS))
1874             {
1875               token->type = OP_BACK_REF;
1876               token->opr.idx = c2 - '1';
1877             }
1878           break;
1879         case '<':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             {
1882               token->type = ANCHOR;
1883               token->opr.ctx_type = WORD_FIRST;
1884             }
1885           break;
1886         case '>':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             {
1889               token->type = ANCHOR;
1890               token->opr.ctx_type = WORD_LAST;
1891             }
1892           break;
1893         case 'b':
1894           if (!(syntax & RE_NO_GNU_OPS))
1895             {
1896               token->type = ANCHOR;
1897               token->opr.ctx_type = WORD_DELIM;
1898             }
1899           break;
1900         case 'B':
1901           if (!(syntax & RE_NO_GNU_OPS))
1902             {
1903               token->type = ANCHOR;
1904               token->opr.ctx_type = NOT_WORD_DELIM;
1905             }
1906           break;
1907         case 'w':
1908           if (!(syntax & RE_NO_GNU_OPS))
1909             token->type = OP_WORD;
1910           break;
1911         case 'W':
1912           if (!(syntax & RE_NO_GNU_OPS))
1913             token->type = OP_NOTWORD;
1914           break;
1915         case 's':
1916           if (!(syntax & RE_NO_GNU_OPS))
1917             token->type = OP_SPACE;
1918           break;
1919         case 'S':
1920           if (!(syntax & RE_NO_GNU_OPS))
1921             token->type = OP_NOTSPACE;
1922           break;
1923         case '`':
1924           if (!(syntax & RE_NO_GNU_OPS))
1925             {
1926               token->type = ANCHOR;
1927               token->opr.ctx_type = BUF_FIRST;
1928             }
1929           break;
1930         case '\'':
1931           if (!(syntax & RE_NO_GNU_OPS))
1932             {
1933               token->type = ANCHOR;
1934               token->opr.ctx_type = BUF_LAST;
1935             }
1936           break;
1937         case '(':
1938           if (!(syntax & RE_NO_BK_PARENS))
1939             token->type = OP_OPEN_SUBEXP;
1940           break;
1941         case ')':
1942           if (!(syntax & RE_NO_BK_PARENS))
1943             token->type = OP_CLOSE_SUBEXP;
1944           break;
1945         case '+':
1946           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1947             token->type = OP_DUP_PLUS;
1948           break;
1949         case '?':
1950           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1951             token->type = OP_DUP_QUESTION;
1952           break;
1953         case '{':
1954           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1955             token->type = OP_OPEN_DUP_NUM;
1956           break;
1957         case '}':
1958           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1959             token->type = OP_CLOSE_DUP_NUM;
1960           break;
1961         default:
1962           break;
1963         }
1964       return 2;
1965     }
1966
1967   token->type = CHARACTER;
1968 #ifdef RE_ENABLE_I18N
1969   if (input->mb_cur_max > 1)
1970     {
1971       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1972       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1973     }
1974   else
1975 #endif
1976     token->word_char = IS_WORD_CHAR (token->opr.c);
1977
1978   switch (c)
1979     {
1980     case '\n':
1981       if (syntax & RE_NEWLINE_ALT)
1982         token->type = OP_ALT;
1983       break;
1984     case '|':
1985       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1986         token->type = OP_ALT;
1987       break;
1988     case '*':
1989       token->type = OP_DUP_ASTERISK;
1990       break;
1991     case '+':
1992       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1993         token->type = OP_DUP_PLUS;
1994       break;
1995     case '?':
1996       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1997         token->type = OP_DUP_QUESTION;
1998       break;
1999     case '{':
2000       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2001         token->type = OP_OPEN_DUP_NUM;
2002       break;
2003     case '}':
2004       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2005         token->type = OP_CLOSE_DUP_NUM;
2006       break;
2007     case '(':
2008       if (syntax & RE_NO_BK_PARENS)
2009         token->type = OP_OPEN_SUBEXP;
2010       break;
2011     case ')':
2012       if (syntax & RE_NO_BK_PARENS)
2013         token->type = OP_CLOSE_SUBEXP;
2014       break;
2015     case '[':
2016       token->type = OP_OPEN_BRACKET;
2017       break;
2018     case '.':
2019       token->type = OP_PERIOD;
2020       break;
2021     case '^':
2022       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2023           re_string_cur_idx (input) != 0)
2024         {
2025           char prev = re_string_peek_byte (input, -1);
2026           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2027             break;
2028         }
2029       token->type = ANCHOR;
2030       token->opr.ctx_type = LINE_FIRST;
2031       break;
2032     case '$':
2033       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2034           re_string_cur_idx (input) + 1 != re_string_length (input))
2035         {
2036           re_token_t next;
2037           re_string_skip_bytes (input, 1);
2038           peek_token (&next, input, syntax);
2039           re_string_skip_bytes (input, -1);
2040           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2041             break;
2042         }
2043       token->type = ANCHOR;
2044       token->opr.ctx_type = LINE_LAST;
2045       break;
2046     default:
2047       break;
2048     }
2049   return 1;
2050 }
2051
2052 /* Peek a token from INPUT, and return the length of the token.
2053    We must not use this function out of bracket expressions.  */
2054
2055 static int
2056 internal_function
2057 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2058 {
2059   unsigned char c;
2060   if (re_string_eoi (input))
2061     {
2062       token->type = END_OF_RE;
2063       return 0;
2064     }
2065   c = re_string_peek_byte (input, 0);
2066   token->opr.c = c;
2067
2068 #ifdef RE_ENABLE_I18N
2069   if (input->mb_cur_max > 1 &&
2070       !re_string_first_byte (input, re_string_cur_idx (input)))
2071     {
2072       token->type = CHARACTER;
2073       return 1;
2074     }
2075 #endif /* RE_ENABLE_I18N */
2076
2077   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2078       && re_string_cur_idx (input) + 1 < re_string_length (input))
2079     {
2080       /* In this case, '\' escape a character.  */
2081       unsigned char c2;
2082       re_string_skip_bytes (input, 1);
2083       c2 = re_string_peek_byte (input, 0);
2084       token->opr.c = c2;
2085       token->type = CHARACTER;
2086       return 1;
2087     }
2088   if (c == '[') /* '[' is a special char in a bracket exps.  */
2089     {
2090       unsigned char c2;
2091       int token_len;
2092       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2093         c2 = re_string_peek_byte (input, 1);
2094       else
2095         c2 = 0;
2096       token->opr.c = c2;
2097       token_len = 2;
2098       switch (c2)
2099         {
2100         case '.':
2101           token->type = OP_OPEN_COLL_ELEM;
2102           break;
2103         case '=':
2104           token->type = OP_OPEN_EQUIV_CLASS;
2105           break;
2106         case ':':
2107           if (syntax & RE_CHAR_CLASSES)
2108             {
2109               token->type = OP_OPEN_CHAR_CLASS;
2110               break;
2111             }
2112           /* else fall through.  */
2113         default:
2114           token->type = CHARACTER;
2115           token->opr.c = c;
2116           token_len = 1;
2117           break;
2118         }
2119       return token_len;
2120     }
2121   switch (c)
2122     {
2123     case '-':
2124       token->type = OP_CHARSET_RANGE;
2125       break;
2126     case ']':
2127       token->type = OP_CLOSE_BRACKET;
2128       break;
2129     case '^':
2130       token->type = OP_NON_MATCH_LIST;
2131       break;
2132     default:
2133       token->type = CHARACTER;
2134     }
2135   return 1;
2136 }
2137 \f
2138 /* Functions for parser.  */
2139
2140 /* Entry point of the parser.
2141    Parse the regular expression REGEXP and return the structure tree.
2142    If an error occurs, ERR is set by error code, and return NULL.
2143    This function build the following tree, from regular expression <reg_exp>:
2144            CAT
2145            / \
2146           /   \
2147    <reg_exp>  EOR
2148
2149    CAT means concatenation.
2150    EOR means end of regular expression.  */
2151
2152 static bin_tree_t *
2153 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2154        reg_errcode_t *err)
2155 {
2156   re_dfa_t *dfa = preg->buffer;
2157   bin_tree_t *tree, *eor, *root;
2158   re_token_t current_token;
2159   dfa->syntax = syntax;
2160   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2161   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2162   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2163     return NULL;
2164   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2165   if (tree != NULL)
2166     root = create_tree (dfa, tree, eor, CONCAT);
2167   else
2168     root = eor;
2169   if (BE (eor == NULL || root == NULL, 0))
2170     {
2171       *err = REG_ESPACE;
2172       return NULL;
2173     }
2174   return root;
2175 }
2176
2177 /* This function build the following tree, from regular expression
2178    <branch1>|<branch2>:
2179            ALT
2180            / \
2181           /   \
2182    <branch1> <branch2>
2183
2184    ALT means alternative, which represents the operator '|'.  */
2185
2186 static bin_tree_t *
2187 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2188                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2189 {
2190   re_dfa_t *dfa = preg->buffer;
2191   bin_tree_t *tree, *branch = NULL;
2192   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2193   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2194     return NULL;
2195
2196   while (token->type == OP_ALT)
2197     {
2198       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2199       if (token->type != OP_ALT && token->type != END_OF_RE
2200           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2201         {
2202           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2203           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2204             return NULL;
2205         }
2206       else
2207         branch = NULL;
2208       tree = create_tree (dfa, tree, branch, OP_ALT);
2209       if (BE (tree == NULL, 0))
2210         {
2211           *err = REG_ESPACE;
2212           return NULL;
2213         }
2214     }
2215   return tree;
2216 }
2217
2218 /* This function build the following tree, from regular expression
2219    <exp1><exp2>:
2220         CAT
2221         / \
2222        /   \
2223    <exp1> <exp2>
2224
2225    CAT means concatenation.  */
2226
2227 static bin_tree_t *
2228 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2229               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2230 {
2231   bin_tree_t *tree, *expr;
2232   re_dfa_t *dfa = preg->buffer;
2233   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2234   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2235     return NULL;
2236
2237   while (token->type != OP_ALT && token->type != END_OF_RE
2238          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2239     {
2240       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2241       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2242         {
2243           if (tree != NULL)
2244             postorder (tree, free_tree, NULL);
2245           return NULL;
2246         }
2247       if (tree != NULL && expr != NULL)
2248         {
2249           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2250           if (newtree == NULL)
2251             {
2252               postorder (expr, free_tree, NULL);
2253               postorder (tree, free_tree, NULL);
2254               *err = REG_ESPACE;
2255               return NULL;
2256             }
2257           tree = newtree;
2258         }
2259       else if (tree == NULL)
2260         tree = expr;
2261       /* Otherwise expr == NULL, we don't need to create new tree.  */
2262     }
2263   return tree;
2264 }
2265
2266 /* This function build the following tree, from regular expression a*:
2267          *
2268          |
2269          a
2270 */
2271
2272 static bin_tree_t *
2273 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2274                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2275 {
2276   re_dfa_t *dfa = preg->buffer;
2277   bin_tree_t *tree;
2278   switch (token->type)
2279     {
2280     case CHARACTER:
2281       tree = create_token_tree (dfa, NULL, NULL, token);
2282       if (BE (tree == NULL, 0))
2283         {
2284           *err = REG_ESPACE;
2285           return NULL;
2286         }
2287 #ifdef RE_ENABLE_I18N
2288       if (dfa->mb_cur_max > 1)
2289         {
2290           while (!re_string_eoi (regexp)
2291                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2292             {
2293               bin_tree_t *mbc_remain;
2294               fetch_token (token, regexp, syntax);
2295               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2296               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2297               if (BE (mbc_remain == NULL || tree == NULL, 0))
2298                 {
2299                   *err = REG_ESPACE;
2300                   return NULL;
2301                 }
2302             }
2303         }
2304 #endif
2305       break;
2306     case OP_OPEN_SUBEXP:
2307       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2308       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2309         return NULL;
2310       break;
2311     case OP_OPEN_BRACKET:
2312       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2313       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2314         return NULL;
2315       break;
2316     case OP_BACK_REF:
2317       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2318         {
2319           *err = REG_ESUBREG;
2320           return NULL;
2321         }
2322       dfa->used_bkref_map |= 1 << token->opr.idx;
2323       tree = create_token_tree (dfa, NULL, NULL, token);
2324       if (BE (tree == NULL, 0))
2325         {
2326           *err = REG_ESPACE;
2327           return NULL;
2328         }
2329       ++dfa->nbackref;
2330       dfa->has_mb_node = 1;
2331       break;
2332     case OP_OPEN_DUP_NUM:
2333       if (syntax & RE_CONTEXT_INVALID_DUP)
2334         {
2335           *err = REG_BADRPT;
2336           return NULL;
2337         }
2338       /* FALLTHROUGH */
2339     case OP_DUP_ASTERISK:
2340     case OP_DUP_PLUS:
2341     case OP_DUP_QUESTION:
2342       if (syntax & RE_CONTEXT_INVALID_OPS)
2343         {
2344           *err = REG_BADRPT;
2345           return NULL;
2346         }
2347       else if (syntax & RE_CONTEXT_INDEP_OPS)
2348         {
2349           fetch_token (token, regexp, syntax);
2350           return parse_expression (regexp, preg, token, syntax, nest, err);
2351         }
2352       /* else fall through  */
2353     case OP_CLOSE_SUBEXP:
2354       if ((token->type == OP_CLOSE_SUBEXP) &&
2355           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2356         {
2357           *err = REG_ERPAREN;
2358           return NULL;
2359         }
2360       /* else fall through  */
2361     case OP_CLOSE_DUP_NUM:
2362       /* We treat it as a normal character.  */
2363
2364       /* Then we can these characters as normal characters.  */
2365       token->type = CHARACTER;
2366       /* mb_partial and word_char bits should be initialized already
2367          by peek_token.  */
2368       tree = create_token_tree (dfa, NULL, NULL, token);
2369       if (BE (tree == NULL, 0))
2370         {
2371           *err = REG_ESPACE;
2372           return NULL;
2373         }
2374       break;
2375     case ANCHOR:
2376       if ((token->opr.ctx_type
2377            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2378           && dfa->word_ops_used == 0)
2379         init_word_char (dfa);
2380       if (token->opr.ctx_type == WORD_DELIM
2381           || token->opr.ctx_type == NOT_WORD_DELIM)
2382         {
2383           bin_tree_t *tree_first, *tree_last;
2384           if (token->opr.ctx_type == WORD_DELIM)
2385             {
2386               token->opr.ctx_type = WORD_FIRST;
2387               tree_first = create_token_tree (dfa, NULL, NULL, token);
2388               token->opr.ctx_type = WORD_LAST;
2389             }
2390           else
2391             {
2392               token->opr.ctx_type = INSIDE_WORD;
2393               tree_first = create_token_tree (dfa, NULL, NULL, token);
2394               token->opr.ctx_type = INSIDE_NOTWORD;
2395             }
2396           tree_last = create_token_tree (dfa, NULL, NULL, token);
2397           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2398           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2399             {
2400               *err = REG_ESPACE;
2401               return NULL;
2402             }
2403         }
2404       else
2405         {
2406           tree = create_token_tree (dfa, NULL, NULL, token);
2407           if (BE (tree == NULL, 0))
2408             {
2409               *err = REG_ESPACE;
2410               return NULL;
2411             }
2412         }
2413       /* We must return here, since ANCHORs can't be followed
2414          by repetition operators.
2415          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2416              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2417       fetch_token (token, regexp, syntax);
2418       return tree;
2419     case OP_PERIOD:
2420       tree = create_token_tree (dfa, NULL, NULL, token);
2421       if (BE (tree == NULL, 0))
2422         {
2423           *err = REG_ESPACE;
2424           return NULL;
2425         }
2426       if (dfa->mb_cur_max > 1)
2427         dfa->has_mb_node = 1;
2428       break;
2429     case OP_WORD:
2430     case OP_NOTWORD:
2431       tree = build_charclass_op (dfa, regexp->trans,
2432                                  "alnum",
2433                                  "_",
2434                                  token->type == OP_NOTWORD, err);
2435       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2436         return NULL;
2437       break;
2438     case OP_SPACE:
2439     case OP_NOTSPACE:
2440       tree = build_charclass_op (dfa, regexp->trans,
2441                                  "space",
2442                                  "",
2443                                  token->type == OP_NOTSPACE, err);
2444       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2445         return NULL;
2446       break;
2447     case OP_ALT:
2448     case END_OF_RE:
2449       return NULL;
2450     case BACK_SLASH:
2451       *err = REG_EESCAPE;
2452       return NULL;
2453     default:
2454       /* Must not happen?  */
2455 #ifdef DEBUG
2456       assert (0);
2457 #endif
2458       return NULL;
2459     }
2460   fetch_token (token, regexp, syntax);
2461
2462   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2463          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2464     {
2465       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2466       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2467         return NULL;
2468       /* In BRE consecutive duplications are not allowed.  */
2469       if ((syntax & RE_CONTEXT_INVALID_DUP)
2470           && (token->type == OP_DUP_ASTERISK
2471               || token->type == OP_OPEN_DUP_NUM))
2472         {
2473           *err = REG_BADRPT;
2474           return NULL;
2475         }
2476     }
2477
2478   return tree;
2479 }
2480
2481 /* This function build the following tree, from regular expression
2482    (<reg_exp>):
2483          SUBEXP
2484             |
2485         <reg_exp>
2486 */
2487
2488 static bin_tree_t *
2489 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2490                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2491 {
2492   re_dfa_t *dfa = preg->buffer;
2493   bin_tree_t *tree;
2494   size_t cur_nsub;
2495   cur_nsub = preg->re_nsub++;
2496
2497   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2498
2499   /* The subexpression may be a null string.  */
2500   if (token->type == OP_CLOSE_SUBEXP)
2501     tree = NULL;
2502   else
2503     {
2504       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2505       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2506         {
2507           if (tree != NULL)
2508             postorder (tree, free_tree, NULL);
2509           *err = REG_EPAREN;
2510         }
2511       if (BE (*err != REG_NOERROR, 0))
2512         return NULL;
2513     }
2514
2515   if (cur_nsub <= '9' - '1')
2516     dfa->completed_bkref_map |= 1 << cur_nsub;
2517
2518   tree = create_tree (dfa, tree, NULL, SUBEXP);
2519   if (BE (tree == NULL, 0))
2520     {
2521       *err = REG_ESPACE;
2522       return NULL;
2523     }
2524   tree->token.opr.idx = cur_nsub;
2525   return tree;
2526 }
2527
2528 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2529
2530 static bin_tree_t *
2531 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2532               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2533 {
2534   bin_tree_t *tree = NULL, *old_tree = NULL;
2535   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2536   re_token_t start_token = *token;
2537
2538   if (token->type == OP_OPEN_DUP_NUM)
2539     {
2540       end = 0;
2541       start = fetch_number (regexp, token, syntax);
2542       if (start == REG_MISSING)
2543         {
2544           if (token->type == CHARACTER && token->opr.c == ',')
2545             start = 0; /* We treat "{,m}" as "{0,m}".  */
2546           else
2547             {
2548               *err = REG_BADBR; /* <re>{} is invalid.  */
2549               return NULL;
2550             }
2551         }
2552       if (BE (start != REG_ERROR, 1))
2553         {
2554           /* We treat "{n}" as "{n,n}".  */
2555           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2556                  : ((token->type == CHARACTER && token->opr.c == ',')
2557                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2558         }
2559       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2560         {
2561           /* Invalid sequence.  */
2562           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2563             {
2564               if (token->type == END_OF_RE)
2565                 *err = REG_EBRACE;
2566               else
2567                 *err = REG_BADBR;
2568
2569               return NULL;
2570             }
2571
2572           /* If the syntax bit is set, rollback.  */
2573           re_string_set_index (regexp, start_idx);
2574           *token = start_token;
2575           token->type = CHARACTER;
2576           /* mb_partial and word_char bits should be already initialized by
2577              peek_token.  */
2578           return elem;
2579         }
2580
2581       if (BE ((end != REG_MISSING && start > end)
2582               || token->type != OP_CLOSE_DUP_NUM, 0))
2583         {
2584           /* First number greater than second.  */
2585           *err = REG_BADBR;
2586           return NULL;
2587         }
2588
2589       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2590         {
2591           *err = REG_ESIZE;
2592           return NULL;
2593         }
2594     }
2595   else
2596     {
2597       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2598       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2599     }
2600
2601   fetch_token (token, regexp, syntax);
2602
2603   if (BE (elem == NULL, 0))
2604     return NULL;
2605   if (BE (start == 0 && end == 0, 0))
2606     {
2607       postorder (elem, free_tree, NULL);
2608       return NULL;
2609     }
2610
2611   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2612   if (BE (start > 0, 0))
2613     {
2614       tree = elem;
2615       for (i = 2; i <= start; ++i)
2616         {
2617           elem = duplicate_tree (elem, dfa);
2618           tree = create_tree (dfa, tree, elem, CONCAT);
2619           if (BE (elem == NULL || tree == NULL, 0))
2620             goto parse_dup_op_espace;
2621         }
2622
2623       if (start == end)
2624         return tree;
2625
2626       /* Duplicate ELEM before it is marked optional.  */
2627       elem = duplicate_tree (elem, dfa);
2628       old_tree = tree;
2629     }
2630   else
2631     old_tree = NULL;
2632
2633   if (elem->token.type == SUBEXP)
2634     {
2635       uintptr_t subidx = elem->token.opr.idx;
2636       postorder (elem, mark_opt_subexp, (void *) subidx);
2637     }
2638
2639   tree = create_tree (dfa, elem, NULL,
2640                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2641   if (BE (tree == NULL, 0))
2642     goto parse_dup_op_espace;
2643
2644 /* From gnulib's "intprops.h":
2645    True if the arithmetic type T is signed.  */
2646 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2647
2648   /* This loop is actually executed only when end != REG_MISSING,
2649      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2650      already created the start+1-th copy.  */
2651   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2652     for (i = start + 2; i <= end; ++i)
2653       {
2654         elem = duplicate_tree (elem, dfa);
2655         tree = create_tree (dfa, tree, elem, CONCAT);
2656         if (BE (elem == NULL || tree == NULL, 0))
2657           goto parse_dup_op_espace;
2658
2659         tree = create_tree (dfa, tree, NULL, OP_ALT);
2660         if (BE (tree == NULL, 0))
2661           goto parse_dup_op_espace;
2662       }
2663
2664   if (old_tree)
2665     tree = create_tree (dfa, old_tree, tree, CONCAT);
2666
2667   return tree;
2668
2669  parse_dup_op_espace:
2670   *err = REG_ESPACE;
2671   return NULL;
2672 }
2673
2674 /* Size of the names for collating symbol/equivalence_class/character_class.
2675    I'm not sure, but maybe enough.  */
2676 #define BRACKET_NAME_BUF_SIZE 32
2677
2678 #ifndef _LIBC
2679   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2680      Build the range expression which starts from START_ELEM, and ends
2681      at END_ELEM.  The result are written to MBCSET and SBCSET.
2682      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2683      mbcset->range_ends, is a pointer argument since we may
2684      update it.  */
2685
2686 static reg_errcode_t
2687 internal_function
2688 # ifdef RE_ENABLE_I18N
2689 build_range_exp (const reg_syntax_t syntax,
2690                  bitset_t sbcset,
2691                  re_charset_t *mbcset,
2692                  Idx *range_alloc,
2693                  const bracket_elem_t *start_elem,
2694                  const bracket_elem_t *end_elem)
2695 # else /* not RE_ENABLE_I18N */
2696 build_range_exp (const reg_syntax_t syntax,
2697                  bitset_t sbcset,
2698                  const bracket_elem_t *start_elem,
2699                  const bracket_elem_t *end_elem)
2700 # endif /* not RE_ENABLE_I18N */
2701 {
2702   unsigned int start_ch, end_ch;
2703   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2704   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2705           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2706           0))
2707     return REG_ERANGE;
2708
2709   /* We can handle no multi character collating elements without libc
2710      support.  */
2711   if (BE ((start_elem->type == COLL_SYM
2712            && strlen ((char *) start_elem->opr.name) > 1)
2713           || (end_elem->type == COLL_SYM
2714               && strlen ((char *) end_elem->opr.name) > 1), 0))
2715     return REG_ECOLLATE;
2716
2717 # ifdef RE_ENABLE_I18N
2718   {
2719     wchar_t wc;
2720     wint_t start_wc;
2721     wint_t end_wc;
2722
2723     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2724                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2725                    : 0));
2726     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2727               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2728                  : 0));
2729     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2730                 ? __btowc (start_ch) : start_elem->opr.wch);
2731     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2732               ? __btowc (end_ch) : end_elem->opr.wch);
2733     if (start_wc == WEOF || end_wc == WEOF)
2734       return REG_ECOLLATE;
2735     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2736       return REG_ERANGE;
2737
2738     /* Got valid collation sequence values, add them as a new entry.
2739        However, for !_LIBC we have no collation elements: if the
2740        character set is single byte, the single byte character set
2741        that we build below suffices.  parse_bracket_exp passes
2742        no MBCSET if dfa->mb_cur_max == 1.  */
2743     if (mbcset)
2744       {
2745         /* Check the space of the arrays.  */
2746         if (BE (*range_alloc == mbcset->nranges, 0))
2747           {
2748             /* There is not enough space, need realloc.  */
2749             wchar_t *new_array_start, *new_array_end;
2750             Idx new_nranges;
2751
2752             /* +1 in case of mbcset->nranges is 0.  */
2753             new_nranges = 2 * mbcset->nranges + 1;
2754             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2755                are NULL if *range_alloc == 0.  */
2756             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2757                                           new_nranges);
2758             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2759                                         new_nranges);
2760
2761             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2762               return REG_ESPACE;
2763
2764             mbcset->range_starts = new_array_start;
2765             mbcset->range_ends = new_array_end;
2766             *range_alloc = new_nranges;
2767           }
2768
2769         mbcset->range_starts[mbcset->nranges] = start_wc;
2770         mbcset->range_ends[mbcset->nranges++] = end_wc;
2771       }
2772
2773     /* Build the table for single byte characters.  */
2774     for (wc = 0; wc < SBC_MAX; ++wc)
2775       {
2776         if (start_wc <= wc && wc <= end_wc)
2777           bitset_set (sbcset, wc);
2778       }
2779   }
2780 # else /* not RE_ENABLE_I18N */
2781   {
2782     unsigned int ch;
2783     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2784                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2785                    : 0));
2786     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2787               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2788                  : 0));
2789     if (start_ch > end_ch)
2790       return REG_ERANGE;
2791     /* Build the table for single byte characters.  */
2792     for (ch = 0; ch < SBC_MAX; ++ch)
2793       if (start_ch <= ch  && ch <= end_ch)
2794         bitset_set (sbcset, ch);
2795   }
2796 # endif /* not RE_ENABLE_I18N */
2797   return REG_NOERROR;
2798 }
2799 #endif /* not _LIBC */
2800
2801 #ifndef _LIBC
2802 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2803    Build the collating element which is represented by NAME.
2804    The result are written to MBCSET and SBCSET.
2805    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2806    pointer argument since we may update it.  */
2807
2808 static reg_errcode_t
2809 internal_function
2810 # ifdef RE_ENABLE_I18N
2811 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2812                         Idx *coll_sym_alloc, const unsigned char *name)
2813 # else /* not RE_ENABLE_I18N */
2814 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2815 # endif /* not RE_ENABLE_I18N */
2816 {
2817   size_t name_len = strlen ((const char *) name);
2818   if (BE (name_len != 1, 0))
2819     return REG_ECOLLATE;
2820   else
2821     {
2822       bitset_set (sbcset, name[0]);
2823       return REG_NOERROR;
2824     }
2825 }
2826 #endif /* not _LIBC */
2827
2828 /* This function parse bracket expression like "[abc]", "[a-c]",
2829    "[[.a-a.]]" etc.  */
2830
2831 static bin_tree_t *
2832 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2833                    reg_syntax_t syntax, reg_errcode_t *err)
2834 {
2835 #ifdef _LIBC
2836   const unsigned char *collseqmb;
2837   const char *collseqwc;
2838   uint32_t nrules;
2839   int32_t table_size;
2840   const int32_t *symb_table;
2841   const unsigned char *extra;
2842
2843   /* Local function for parse_bracket_exp used in _LIBC environment.
2844      Seek the collating symbol entry corresponding to NAME.
2845      Return the index of the symbol in the SYMB_TABLE,
2846      or -1 if not found.  */
2847
2848   auto inline int32_t
2849   __attribute__ ((always_inline))
2850   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2851     {
2852       int32_t elem;
2853
2854       for (elem = 0; elem < table_size; elem++)
2855         if (symb_table[2 * elem] != 0)
2856           {
2857             int32_t idx = symb_table[2 * elem + 1];
2858             /* Skip the name of collating element name.  */
2859             idx += 1 + extra[idx];
2860             if (/* Compare the length of the name.  */
2861                 name_len == extra[idx]
2862                 /* Compare the name.  */
2863                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2864               /* Yep, this is the entry.  */
2865               return elem;
2866           }
2867       return -1;
2868     }
2869
2870   /* Local function for parse_bracket_exp used in _LIBC environment.
2871      Look up the collation sequence value of BR_ELEM.
2872      Return the value if succeeded, UINT_MAX otherwise.  */
2873
2874   auto inline unsigned int
2875   __attribute__ ((always_inline))
2876   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2877     {
2878       if (br_elem->type == SB_CHAR)
2879         {
2880           /*
2881           if (MB_CUR_MAX == 1)
2882           */
2883           if (nrules == 0)
2884             return collseqmb[br_elem->opr.ch];
2885           else
2886             {
2887               wint_t wc = __btowc (br_elem->opr.ch);
2888               return __collseq_table_lookup (collseqwc, wc);
2889             }
2890         }
2891       else if (br_elem->type == MB_CHAR)
2892         {
2893           if (nrules != 0)
2894             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2895         }
2896       else if (br_elem->type == COLL_SYM)
2897         {
2898           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2899           if (nrules != 0)
2900             {
2901               int32_t elem, idx;
2902               elem = seek_collating_symbol_entry (br_elem->opr.name,
2903                                                   sym_name_len);
2904               if (elem != -1)
2905                 {
2906                   /* We found the entry.  */
2907                   idx = symb_table[2 * elem + 1];
2908                   /* Skip the name of collating element name.  */
2909                   idx += 1 + extra[idx];
2910                   /* Skip the byte sequence of the collating element.  */
2911                   idx += 1 + extra[idx];
2912                   /* Adjust for the alignment.  */
2913                   idx = (idx + 3) & ~3;
2914                   /* Skip the multibyte collation sequence value.  */
2915                   idx += sizeof (unsigned int);
2916                   /* Skip the wide char sequence of the collating element.  */
2917                   idx += sizeof (unsigned int) *
2918                     (1 + *(unsigned int *) (extra + idx));
2919                   /* Return the collation sequence value.  */
2920                   return *(unsigned int *) (extra + idx);
2921                 }
2922               else if (sym_name_len == 1)
2923                 {
2924                   /* No valid character.  Match it as a single byte
2925                      character.  */
2926                   return collseqmb[br_elem->opr.name[0]];
2927                 }
2928             }
2929           else if (sym_name_len == 1)
2930             return collseqmb[br_elem->opr.name[0]];
2931         }
2932       return UINT_MAX;
2933     }
2934
2935   /* Local function for parse_bracket_exp used in _LIBC environment.
2936      Build the range expression which starts from START_ELEM, and ends
2937      at END_ELEM.  The result are written to MBCSET and SBCSET.
2938      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2939      mbcset->range_ends, is a pointer argument since we may
2940      update it.  */
2941
2942   auto inline reg_errcode_t
2943   __attribute__ ((always_inline))
2944   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2945                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2946     {
2947       unsigned int ch;
2948       uint32_t start_collseq;
2949       uint32_t end_collseq;
2950
2951       /* Equivalence Classes and Character Classes can't be a range
2952          start/end.  */
2953       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2954               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2955               0))
2956         return REG_ERANGE;
2957
2958       /* FIXME: Implement rational ranges here, too.  */
2959       start_collseq = lookup_collation_sequence_value (start_elem);
2960       end_collseq = lookup_collation_sequence_value (end_elem);
2961       /* Check start/end collation sequence values.  */
2962       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2963         return REG_ECOLLATE;
2964       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2965         return REG_ERANGE;
2966
2967       /* Got valid collation sequence values, add them as a new entry.
2968          However, if we have no collation elements, and the character set
2969          is single byte, the single byte character set that we
2970          build below suffices. */
2971       if (nrules > 0 || dfa->mb_cur_max > 1)
2972         {
2973           /* Check the space of the arrays.  */
2974           if (BE (*range_alloc == mbcset->nranges, 0))
2975             {
2976               /* There is not enough space, need realloc.  */
2977               uint32_t *new_array_start;
2978               uint32_t *new_array_end;
2979               Idx new_nranges;
2980
2981               /* +1 in case of mbcset->nranges is 0.  */
2982               new_nranges = 2 * mbcset->nranges + 1;
2983               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2984                                             new_nranges);
2985               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2986                                           new_nranges);
2987
2988               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2989                 return REG_ESPACE;
2990
2991               mbcset->range_starts = new_array_start;
2992               mbcset->range_ends = new_array_end;
2993               *range_alloc = new_nranges;
2994             }
2995
2996           mbcset->range_starts[mbcset->nranges] = start_collseq;
2997           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2998         }
2999
3000       /* Build the table for single byte characters.  */
3001       for (ch = 0; ch < SBC_MAX; ch++)
3002         {
3003           uint32_t ch_collseq;
3004           /*
3005           if (MB_CUR_MAX == 1)
3006           */
3007           if (nrules == 0)
3008             ch_collseq = collseqmb[ch];
3009           else
3010             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3011           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3012             bitset_set (sbcset, ch);
3013         }
3014       return REG_NOERROR;
3015     }
3016
3017   /* Local function for parse_bracket_exp used in _LIBC environment.
3018      Build the collating element which is represented by NAME.
3019      The result are written to MBCSET and SBCSET.
3020      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3021      pointer argument since we may update it.  */
3022
3023   auto inline reg_errcode_t
3024   __attribute__ ((always_inline))
3025   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3026                           Idx *coll_sym_alloc, const unsigned char *name)
3027     {
3028       int32_t elem, idx;
3029       size_t name_len = strlen ((const char *) name);
3030       if (nrules != 0)
3031         {
3032           elem = seek_collating_symbol_entry (name, name_len);
3033           if (elem != -1)
3034             {
3035               /* We found the entry.  */
3036               idx = symb_table[2 * elem + 1];
3037               /* Skip the name of collating element name.  */
3038               idx += 1 + extra[idx];
3039             }
3040           else if (name_len == 1)
3041             {
3042               /* No valid character, treat it as a normal
3043                  character.  */
3044               bitset_set (sbcset, name[0]);
3045               return REG_NOERROR;
3046             }
3047           else
3048             return REG_ECOLLATE;
3049
3050           /* Got valid collation sequence, add it as a new entry.  */
3051           /* Check the space of the arrays.  */
3052           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3053             {
3054               /* Not enough, realloc it.  */
3055               /* +1 in case of mbcset->ncoll_syms is 0.  */
3056               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3057               /* Use realloc since mbcset->coll_syms is NULL
3058                  if *alloc == 0.  */
3059               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3060                                                    new_coll_sym_alloc);
3061               if (BE (new_coll_syms == NULL, 0))
3062                 return REG_ESPACE;
3063               mbcset->coll_syms = new_coll_syms;
3064               *coll_sym_alloc = new_coll_sym_alloc;
3065             }
3066           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3067           return REG_NOERROR;
3068         }
3069       else
3070         {
3071           if (BE (name_len != 1, 0))
3072             return REG_ECOLLATE;
3073           else
3074             {
3075               bitset_set (sbcset, name[0]);
3076               return REG_NOERROR;
3077             }
3078         }
3079     }
3080 #endif
3081
3082   re_token_t br_token;
3083   re_bitset_ptr_t sbcset;
3084 #ifdef RE_ENABLE_I18N
3085   re_charset_t *mbcset;
3086   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3087   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3088 #endif /* not RE_ENABLE_I18N */
3089   bool non_match = false;
3090   bin_tree_t *work_tree;
3091   int token_len;
3092   bool first_round = true;
3093 #ifdef _LIBC
3094   collseqmb = (const unsigned char *)
3095     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3096   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3097   if (nrules)
3098     {
3099       /*
3100       if (MB_CUR_MAX > 1)
3101       */
3102       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3103       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3104       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3105                                                   _NL_COLLATE_SYMB_TABLEMB);
3106       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3107                                                    _NL_COLLATE_SYMB_EXTRAMB);
3108     }
3109 #endif
3110   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3111 #ifdef RE_ENABLE_I18N
3112   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3113 #endif /* RE_ENABLE_I18N */
3114 #ifdef RE_ENABLE_I18N
3115   if (BE (sbcset == NULL || mbcset == NULL, 0))
3116 #else
3117   if (BE (sbcset == NULL, 0))
3118 #endif /* RE_ENABLE_I18N */
3119     {
3120       re_free (sbcset);
3121 #ifdef RE_ENABLE_I18N
3122       re_free (mbcset);
3123 #endif
3124       *err = REG_ESPACE;
3125       return NULL;
3126     }
3127
3128   token_len = peek_token_bracket (token, regexp, syntax);
3129   if (BE (token->type == END_OF_RE, 0))
3130     {
3131       *err = REG_BADPAT;
3132       goto parse_bracket_exp_free_return;
3133     }
3134   if (token->type == OP_NON_MATCH_LIST)
3135     {
3136 #ifdef RE_ENABLE_I18N
3137       mbcset->non_match = 1;
3138 #endif /* not RE_ENABLE_I18N */
3139       non_match = true;
3140       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3141         bitset_set (sbcset, '\n');
3142       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3143       token_len = peek_token_bracket (token, regexp, syntax);
3144       if (BE (token->type == END_OF_RE, 0))
3145         {
3146           *err = REG_BADPAT;
3147           goto parse_bracket_exp_free_return;
3148         }
3149     }
3150
3151   /* We treat the first ']' as a normal character.  */
3152   if (token->type == OP_CLOSE_BRACKET)
3153     token->type = CHARACTER;
3154
3155   while (1)
3156     {
3157       bracket_elem_t start_elem, end_elem;
3158       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3159       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3160       reg_errcode_t ret;
3161       int token_len2 = 0;
3162       bool is_range_exp = false;
3163       re_token_t token2;
3164
3165       start_elem.opr.name = start_name_buf;
3166       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3167                                    syntax, first_round);
3168       if (BE (ret != REG_NOERROR, 0))
3169         {
3170           *err = ret;
3171           goto parse_bracket_exp_free_return;
3172         }
3173       first_round = false;
3174
3175       /* Get information about the next token.  We need it in any case.  */
3176       token_len = peek_token_bracket (token, regexp, syntax);
3177
3178       /* Do not check for ranges if we know they are not allowed.  */
3179       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3180         {
3181           if (BE (token->type == END_OF_RE, 0))
3182             {
3183               *err = REG_EBRACK;
3184               goto parse_bracket_exp_free_return;
3185             }
3186           if (token->type == OP_CHARSET_RANGE)
3187             {
3188               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3189               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3190               if (BE (token2.type == END_OF_RE, 0))
3191                 {
3192                   *err = REG_EBRACK;
3193                   goto parse_bracket_exp_free_return;
3194                 }
3195               if (token2.type == OP_CLOSE_BRACKET)
3196                 {
3197                   /* We treat the last '-' as a normal character.  */
3198                   re_string_skip_bytes (regexp, -token_len);
3199                   token->type = CHARACTER;
3200                 }
3201               else
3202                 is_range_exp = true;
3203             }
3204         }
3205
3206       if (is_range_exp == true)
3207         {
3208           end_elem.opr.name = end_name_buf;
3209           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3210                                        dfa, syntax, true);
3211           if (BE (ret != REG_NOERROR, 0))
3212             {
3213               *err = ret;
3214               goto parse_bracket_exp_free_return;
3215             }
3216
3217           token_len = peek_token_bracket (token, regexp, syntax);
3218
3219 #ifdef _LIBC
3220           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3221                                   &start_elem, &end_elem);
3222 #else
3223 # ifdef RE_ENABLE_I18N
3224           *err = build_range_exp (syntax, sbcset,
3225                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3226                                   &range_alloc, &start_elem, &end_elem);
3227 # else
3228           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3229 # endif
3230 #endif /* RE_ENABLE_I18N */
3231           if (BE (*err != REG_NOERROR, 0))
3232             goto parse_bracket_exp_free_return;
3233         }
3234       else
3235         {
3236           switch (start_elem.type)
3237             {
3238             case SB_CHAR:
3239               bitset_set (sbcset, start_elem.opr.ch);
3240               break;
3241 #ifdef RE_ENABLE_I18N
3242             case MB_CHAR:
3243               /* Check whether the array has enough space.  */
3244               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3245                 {
3246                   wchar_t *new_mbchars;
3247                   /* Not enough, realloc it.  */
3248                   /* +1 in case of mbcset->nmbchars is 0.  */
3249                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3250                   /* Use realloc since array is NULL if *alloc == 0.  */
3251                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3252                                             mbchar_alloc);
3253                   if (BE (new_mbchars == NULL, 0))
3254                     goto parse_bracket_exp_espace;
3255                   mbcset->mbchars = new_mbchars;
3256                 }
3257               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3258               break;
3259 #endif /* RE_ENABLE_I18N */
3260             case EQUIV_CLASS:
3261               *err = build_equiv_class (sbcset,
3262 #ifdef RE_ENABLE_I18N
3263                                         mbcset, &equiv_class_alloc,
3264 #endif /* RE_ENABLE_I18N */
3265                                         start_elem.opr.name);
3266               if (BE (*err != REG_NOERROR, 0))
3267                 goto parse_bracket_exp_free_return;
3268               break;
3269             case COLL_SYM:
3270               *err = build_collating_symbol (sbcset,
3271 #ifdef RE_ENABLE_I18N
3272                                              mbcset, &coll_sym_alloc,
3273 #endif /* RE_ENABLE_I18N */
3274                                              start_elem.opr.name);
3275               if (BE (*err != REG_NOERROR, 0))
3276                 goto parse_bracket_exp_free_return;
3277               break;
3278             case CHAR_CLASS:
3279               *err = build_charclass (regexp->trans, sbcset,
3280 #ifdef RE_ENABLE_I18N
3281                                       mbcset, &char_class_alloc,
3282 #endif /* RE_ENABLE_I18N */
3283                                       (const char *) start_elem.opr.name,
3284                                       syntax);
3285               if (BE (*err != REG_NOERROR, 0))
3286                goto parse_bracket_exp_free_return;
3287               break;
3288             default:
3289               assert (0);
3290               break;
3291             }
3292         }
3293       if (BE (token->type == END_OF_RE, 0))
3294         {
3295           *err = REG_EBRACK;
3296           goto parse_bracket_exp_free_return;
3297         }
3298       if (token->type == OP_CLOSE_BRACKET)
3299         break;
3300     }
3301
3302   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3303
3304   /* If it is non-matching list.  */
3305   if (non_match)
3306     bitset_not (sbcset);
3307
3308 #ifdef RE_ENABLE_I18N
3309   /* Ensure only single byte characters are set.  */
3310   if (dfa->mb_cur_max > 1)
3311     bitset_mask (sbcset, dfa->sb_char);
3312
3313   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3314       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3315                                                      || mbcset->non_match)))
3316     {
3317       bin_tree_t *mbc_tree;
3318       int sbc_idx;
3319       /* Build a tree for complex bracket.  */
3320       dfa->has_mb_node = 1;
3321       br_token.type = COMPLEX_BRACKET;
3322       br_token.opr.mbcset = mbcset;
3323       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3324       if (BE (mbc_tree == NULL, 0))
3325         goto parse_bracket_exp_espace;
3326       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3327         if (sbcset[sbc_idx])
3328           break;
3329       /* If there are no bits set in sbcset, there is no point
3330          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3331       if (sbc_idx < BITSET_WORDS)
3332         {
3333           /* Build a tree for simple bracket.  */
3334           br_token.type = SIMPLE_BRACKET;
3335           br_token.opr.sbcset = sbcset;
3336           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3337           if (BE (work_tree == NULL, 0))
3338             goto parse_bracket_exp_espace;
3339
3340           /* Then join them by ALT node.  */
3341           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3342           if (BE (work_tree == NULL, 0))
3343             goto parse_bracket_exp_espace;
3344         }
3345       else
3346         {
3347           re_free (sbcset);
3348           work_tree = mbc_tree;
3349         }
3350     }
3351   else
3352 #endif /* not RE_ENABLE_I18N */
3353     {
3354 #ifdef RE_ENABLE_I18N
3355       free_charset (mbcset);
3356 #endif
3357       /* Build a tree for simple bracket.  */
3358       br_token.type = SIMPLE_BRACKET;
3359       br_token.opr.sbcset = sbcset;
3360       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3361       if (BE (work_tree == NULL, 0))
3362         goto parse_bracket_exp_espace;
3363     }
3364   return work_tree;
3365
3366  parse_bracket_exp_espace:
3367   *err = REG_ESPACE;
3368  parse_bracket_exp_free_return:
3369   re_free (sbcset);
3370 #ifdef RE_ENABLE_I18N
3371   free_charset (mbcset);
3372 #endif /* RE_ENABLE_I18N */
3373   return NULL;
3374 }
3375
3376 /* Parse an element in the bracket expression.  */
3377
3378 static reg_errcode_t
3379 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3380                        re_token_t *token, int token_len, re_dfa_t *dfa,
3381                        reg_syntax_t syntax, bool accept_hyphen)
3382 {
3383 #ifdef RE_ENABLE_I18N
3384   int cur_char_size;
3385   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3386   if (cur_char_size > 1)
3387     {
3388       elem->type = MB_CHAR;
3389       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3390       re_string_skip_bytes (regexp, cur_char_size);
3391       return REG_NOERROR;
3392     }
3393 #endif /* RE_ENABLE_I18N */
3394   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3395   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3396       || token->type == OP_OPEN_EQUIV_CLASS)
3397     return parse_bracket_symbol (elem, regexp, token);
3398   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3399     {
3400       /* A '-' must only appear as anything but a range indicator before
3401          the closing bracket.  Everything else is an error.  */
3402       re_token_t token2;
3403       (void) peek_token_bracket (&token2, regexp, syntax);
3404       if (token2.type != OP_CLOSE_BRACKET)
3405         /* The actual error value is not standardized since this whole
3406            case is undefined.  But ERANGE makes good sense.  */
3407         return REG_ERANGE;
3408     }
3409   elem->type = SB_CHAR;
3410   elem->opr.ch = token->opr.c;
3411   return REG_NOERROR;
3412 }
3413
3414 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3415    such as [:<character_class>:], [.<collating_element>.], and
3416    [=<equivalent_class>=].  */
3417
3418 static reg_errcode_t
3419 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3420                       re_token_t *token)
3421 {
3422   unsigned char ch, delim = token->opr.c;
3423   int i = 0;
3424   if (re_string_eoi(regexp))
3425     return REG_EBRACK;
3426   for (;; ++i)
3427     {
3428       if (i >= BRACKET_NAME_BUF_SIZE)
3429         return REG_EBRACK;
3430       if (token->type == OP_OPEN_CHAR_CLASS)
3431         ch = re_string_fetch_byte_case (regexp);
3432       else
3433         ch = re_string_fetch_byte (regexp);
3434       if (re_string_eoi(regexp))
3435         return REG_EBRACK;
3436       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3437         break;
3438       elem->opr.name[i] = ch;
3439     }
3440   re_string_skip_bytes (regexp, 1);
3441   elem->opr.name[i] = '\0';
3442   switch (token->type)
3443     {
3444     case OP_OPEN_COLL_ELEM:
3445       elem->type = COLL_SYM;
3446       break;
3447     case OP_OPEN_EQUIV_CLASS:
3448       elem->type = EQUIV_CLASS;
3449       break;
3450     case OP_OPEN_CHAR_CLASS:
3451       elem->type = CHAR_CLASS;
3452       break;
3453     default:
3454       break;
3455     }
3456   return REG_NOERROR;
3457 }
3458
3459   /* Helper function for parse_bracket_exp.
3460      Build the equivalence class which is represented by NAME.
3461      The result are written to MBCSET and SBCSET.
3462      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3463      is a pointer argument since we may update it.  */
3464
3465 static reg_errcode_t
3466 #ifdef RE_ENABLE_I18N
3467 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3468                    Idx *equiv_class_alloc, const unsigned char *name)
3469 #else /* not RE_ENABLE_I18N */
3470 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3471 #endif /* not RE_ENABLE_I18N */
3472 {
3473 #ifdef _LIBC
3474   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3475   if (nrules != 0)
3476     {
3477       const int32_t *table, *indirect;
3478       const unsigned char *weights, *extra, *cp;
3479       unsigned char char_buf[2];
3480       int32_t idx1, idx2;
3481       unsigned int ch;
3482       size_t len;
3483       /* This #include defines a local function!  */
3484 # include <locale/weight.h>
3485       /* Calculate the index for equivalence class.  */
3486       cp = name;
3487       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3488       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3489                                                _NL_COLLATE_WEIGHTMB);
3490       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3491                                                    _NL_COLLATE_EXTRAMB);
3492       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3493                                                 _NL_COLLATE_INDIRECTMB);
3494       idx1 = findidx (&cp, -1);
3495       if (BE (idx1 == 0 || *cp != '\0', 0))
3496         /* This isn't a valid character.  */
3497         return REG_ECOLLATE;
3498
3499       /* Build single byte matching table for this equivalence class.  */
3500       len = weights[idx1 & 0xffffff];
3501       for (ch = 0; ch < SBC_MAX; ++ch)
3502         {
3503           char_buf[0] = ch;
3504           cp = char_buf;
3505           idx2 = findidx (&cp, 1);
3506 /*
3507           idx2 = table[ch];
3508 */
3509           if (idx2 == 0)
3510             /* This isn't a valid character.  */
3511             continue;
3512           /* Compare only if the length matches and the collation rule
3513              index is the same.  */
3514           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3515             {
3516               int cnt = 0;
3517
3518               while (cnt <= len &&
3519                      weights[(idx1 & 0xffffff) + 1 + cnt]
3520                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3521                 ++cnt;
3522
3523               if (cnt > len)
3524                 bitset_set (sbcset, ch);
3525             }
3526         }
3527       /* Check whether the array has enough space.  */
3528       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3529         {
3530           /* Not enough, realloc it.  */
3531           /* +1 in case of mbcset->nequiv_classes is 0.  */
3532           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3533           /* Use realloc since the array is NULL if *alloc == 0.  */
3534           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3535                                                    int32_t,
3536                                                    new_equiv_class_alloc);
3537           if (BE (new_equiv_classes == NULL, 0))
3538             return REG_ESPACE;
3539           mbcset->equiv_classes = new_equiv_classes;
3540           *equiv_class_alloc = new_equiv_class_alloc;
3541         }
3542       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3543     }
3544   else
3545 #endif /* _LIBC */
3546     {
3547       if (BE (strlen ((const char *) name) != 1, 0))
3548         return REG_ECOLLATE;
3549       bitset_set (sbcset, *name);
3550     }
3551   return REG_NOERROR;
3552 }
3553
3554   /* Helper function for parse_bracket_exp.
3555      Build the character class which is represented by NAME.
3556      The result are written to MBCSET and SBCSET.
3557      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3558      is a pointer argument since we may update it.  */
3559
3560 static reg_errcode_t
3561 #ifdef RE_ENABLE_I18N
3562 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3563                  re_charset_t *mbcset, Idx *char_class_alloc,
3564                  const char *class_name, reg_syntax_t syntax)
3565 #else /* not RE_ENABLE_I18N */
3566 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3567                  const char *class_name, reg_syntax_t syntax)
3568 #endif /* not RE_ENABLE_I18N */
3569 {
3570   int i;
3571   const char *name = class_name;
3572
3573   /* In case of REG_ICASE "upper" and "lower" match the both of
3574      upper and lower cases.  */
3575   if ((syntax & RE_ICASE)
3576       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3577     name = "alpha";
3578
3579 #ifdef RE_ENABLE_I18N
3580   /* Check the space of the arrays.  */
3581   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3582     {
3583       /* Not enough, realloc it.  */
3584       /* +1 in case of mbcset->nchar_classes is 0.  */
3585       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3586       /* Use realloc since array is NULL if *alloc == 0.  */
3587       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3588                                                new_char_class_alloc);
3589       if (BE (new_char_classes == NULL, 0))
3590         return REG_ESPACE;
3591       mbcset->char_classes = new_char_classes;
3592       *char_class_alloc = new_char_class_alloc;
3593     }
3594   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3595 #endif /* RE_ENABLE_I18N */
3596
3597 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3598   do {                                          \
3599     if (BE (trans != NULL, 0))                  \
3600       {                                         \
3601         for (i = 0; i < SBC_MAX; ++i)           \
3602           if (ctype_func (i))                   \
3603             bitset_set (sbcset, trans[i]);      \
3604       }                                         \
3605     else                                        \
3606       {                                         \
3607         for (i = 0; i < SBC_MAX; ++i)           \
3608           if (ctype_func (i))                   \
3609             bitset_set (sbcset, i);             \
3610       }                                         \
3611   } while (0)
3612
3613   if (strcmp (name, "alnum") == 0)
3614     BUILD_CHARCLASS_LOOP (isalnum);
3615   else if (strcmp (name, "cntrl") == 0)
3616     BUILD_CHARCLASS_LOOP (iscntrl);
3617   else if (strcmp (name, "lower") == 0)
3618     BUILD_CHARCLASS_LOOP (islower);
3619   else if (strcmp (name, "space") == 0)
3620     BUILD_CHARCLASS_LOOP (isspace);
3621   else if (strcmp (name, "alpha") == 0)
3622     BUILD_CHARCLASS_LOOP (isalpha);
3623   else if (strcmp (name, "digit") == 0)
3624     BUILD_CHARCLASS_LOOP (isdigit);
3625   else if (strcmp (name, "print") == 0)
3626     BUILD_CHARCLASS_LOOP (isprint);
3627   else if (strcmp (name, "upper") == 0)
3628     BUILD_CHARCLASS_LOOP (isupper);
3629   else if (strcmp (name, "blank") == 0)
3630     BUILD_CHARCLASS_LOOP (isblank);
3631   else if (strcmp (name, "graph") == 0)
3632     BUILD_CHARCLASS_LOOP (isgraph);
3633   else if (strcmp (name, "punct") == 0)
3634     BUILD_CHARCLASS_LOOP (ispunct);
3635   else if (strcmp (name, "xdigit") == 0)
3636     BUILD_CHARCLASS_LOOP (isxdigit);
3637   else
3638     return REG_ECTYPE;
3639
3640   return REG_NOERROR;
3641 }
3642
3643 static bin_tree_t *
3644 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3645                     const char *class_name,
3646                     const char *extra, bool non_match,
3647                     reg_errcode_t *err)
3648 {
3649   re_bitset_ptr_t sbcset;
3650 #ifdef RE_ENABLE_I18N
3651   re_charset_t *mbcset;
3652   Idx alloc = 0;
3653 #endif /* not RE_ENABLE_I18N */
3654   reg_errcode_t ret;
3655   re_token_t br_token;
3656   bin_tree_t *tree;
3657
3658   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3659 #ifdef RE_ENABLE_I18N
3660   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3661 #endif /* RE_ENABLE_I18N */
3662
3663 #ifdef RE_ENABLE_I18N
3664   if (BE (sbcset == NULL || mbcset == NULL, 0))
3665 #else /* not RE_ENABLE_I18N */
3666   if (BE (sbcset == NULL, 0))
3667 #endif /* not RE_ENABLE_I18N */
3668     {
3669       *err = REG_ESPACE;
3670       return NULL;
3671     }
3672
3673   if (non_match)
3674     {
3675 #ifdef RE_ENABLE_I18N
3676       mbcset->non_match = 1;
3677 #endif /* not RE_ENABLE_I18N */
3678     }
3679
3680   /* We don't care the syntax in this case.  */
3681   ret = build_charclass (trans, sbcset,
3682 #ifdef RE_ENABLE_I18N
3683                          mbcset, &alloc,
3684 #endif /* RE_ENABLE_I18N */
3685                          class_name, 0);
3686
3687   if (BE (ret != REG_NOERROR, 0))
3688     {
3689       re_free (sbcset);
3690 #ifdef RE_ENABLE_I18N
3691       free_charset (mbcset);
3692 #endif /* RE_ENABLE_I18N */
3693       *err = ret;
3694       return NULL;
3695     }
3696   /* \w match '_' also.  */
3697   for (; *extra; extra++)
3698     bitset_set (sbcset, *extra);
3699
3700   /* If it is non-matching list.  */
3701   if (non_match)
3702     bitset_not (sbcset);
3703
3704 #ifdef RE_ENABLE_I18N
3705   /* Ensure only single byte characters are set.  */
3706   if (dfa->mb_cur_max > 1)
3707     bitset_mask (sbcset, dfa->sb_char);
3708 #endif
3709
3710   /* Build a tree for simple bracket.  */
3711   br_token.type = SIMPLE_BRACKET;
3712   br_token.opr.sbcset = sbcset;
3713   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3714   if (BE (tree == NULL, 0))
3715     goto build_word_op_espace;
3716
3717 #ifdef RE_ENABLE_I18N
3718   if (dfa->mb_cur_max > 1)
3719     {
3720       bin_tree_t *mbc_tree;
3721       /* Build a tree for complex bracket.  */
3722       br_token.type = COMPLEX_BRACKET;
3723       br_token.opr.mbcset = mbcset;
3724       dfa->has_mb_node = 1;
3725       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3726       if (BE (mbc_tree == NULL, 0))
3727         goto build_word_op_espace;
3728       /* Then join them by ALT node.  */
3729       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3730       if (BE (mbc_tree != NULL, 1))
3731         return tree;
3732     }
3733   else
3734     {
3735       free_charset (mbcset);
3736       return tree;
3737     }
3738 #else /* not RE_ENABLE_I18N */
3739   return tree;
3740 #endif /* not RE_ENABLE_I18N */
3741
3742  build_word_op_espace:
3743   re_free (sbcset);
3744 #ifdef RE_ENABLE_I18N
3745   free_charset (mbcset);
3746 #endif /* RE_ENABLE_I18N */
3747   *err = REG_ESPACE;
3748   return NULL;
3749 }
3750
3751 /* This is intended for the expressions like "a{1,3}".
3752    Fetch a number from 'input', and return the number.
3753    Return REG_MISSING if the number field is empty like "{,1}".
3754    Return RE_DUP_MAX + 1 if the number field is too large.
3755    Return REG_ERROR if an error occurred.  */
3756
3757 static Idx
3758 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3759 {
3760   Idx num = REG_MISSING;
3761   unsigned char c;
3762   while (1)
3763     {
3764       fetch_token (token, input, syntax);
3765       c = token->opr.c;
3766       if (BE (token->type == END_OF_RE, 0))
3767         return REG_ERROR;
3768       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3769         break;
3770       num = ((token->type != CHARACTER || c < '0' || '9' < c
3771               || num == REG_ERROR)
3772              ? REG_ERROR
3773              : num == REG_MISSING
3774              ? c - '0'
3775              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3776     }
3777   return num;
3778 }
3779 \f
3780 #ifdef RE_ENABLE_I18N
3781 static void
3782 free_charset (re_charset_t *cset)
3783 {
3784   re_free (cset->mbchars);
3785 # ifdef _LIBC
3786   re_free (cset->coll_syms);
3787   re_free (cset->equiv_classes);
3788   re_free (cset->range_starts);
3789   re_free (cset->range_ends);
3790 # endif
3791   re_free (cset->char_classes);
3792   re_free (cset);
3793 }
3794 #endif /* RE_ENABLE_I18N */
3795 \f
3796 /* Functions for binary tree operation.  */
3797
3798 /* Create a tree node.  */
3799
3800 static bin_tree_t *
3801 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3802              re_token_type_t type)
3803 {
3804   re_token_t t;
3805   t.type = type;
3806   return create_token_tree (dfa, left, right, &t);
3807 }
3808
3809 static bin_tree_t *
3810 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3811                    const re_token_t *token)
3812 {
3813   bin_tree_t *tree;
3814   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3815     {
3816       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3817
3818       if (storage == NULL)
3819         return NULL;
3820       storage->next = dfa->str_tree_storage;
3821       dfa->str_tree_storage = storage;
3822       dfa->str_tree_storage_idx = 0;
3823     }
3824   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3825
3826   tree->parent = NULL;
3827   tree->left = left;
3828   tree->right = right;
3829   tree->token = *token;
3830   tree->token.duplicated = 0;
3831   tree->token.opt_subexp = 0;
3832   tree->first = NULL;
3833   tree->next = NULL;
3834   tree->node_idx = REG_MISSING;
3835
3836   if (left != NULL)
3837     left->parent = tree;
3838   if (right != NULL)
3839     right->parent = tree;
3840   return tree;
3841 }
3842
3843 /* Mark the tree SRC as an optional subexpression.
3844    To be called from preorder or postorder.  */
3845
3846 static reg_errcode_t
3847 mark_opt_subexp (void *extra, bin_tree_t *node)
3848 {
3849   Idx idx = (uintptr_t) extra;
3850   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3851     node->token.opt_subexp = 1;
3852
3853   return REG_NOERROR;
3854 }
3855
3856 /* Free the allocated memory inside NODE. */
3857
3858 static void
3859 free_token (re_token_t *node)
3860 {
3861 #ifdef RE_ENABLE_I18N
3862   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3863     free_charset (node->opr.mbcset);
3864   else
3865 #endif /* RE_ENABLE_I18N */
3866     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3867       re_free (node->opr.sbcset);
3868 }
3869
3870 /* Worker function for tree walking.  Free the allocated memory inside NODE
3871    and its children. */
3872
3873 static reg_errcode_t
3874 free_tree (void *extra, bin_tree_t *node)
3875 {
3876   free_token (&node->token);
3877   return REG_NOERROR;
3878 }
3879
3880
3881 /* Duplicate the node SRC, and return new node.  This is a preorder
3882    visit similar to the one implemented by the generic visitor, but
3883    we need more infrastructure to maintain two parallel trees --- so,
3884    it's easier to duplicate.  */
3885
3886 static bin_tree_t *
3887 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3888 {
3889   const bin_tree_t *node;
3890   bin_tree_t *dup_root;
3891   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3892
3893   for (node = root; ; )
3894     {
3895       /* Create a new tree and link it back to the current parent.  */
3896       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3897       if (*p_new == NULL)
3898         return NULL;
3899       (*p_new)->parent = dup_node;
3900       (*p_new)->token.duplicated = 1;
3901       dup_node = *p_new;
3902
3903       /* Go to the left node, or up and to the right.  */
3904       if (node->left)
3905         {
3906           node = node->left;
3907           p_new = &dup_node->left;
3908         }
3909       else
3910         {
3911           const bin_tree_t *prev = NULL;
3912           while (node->right == prev || node->right == NULL)
3913             {
3914               prev = node;
3915               node = node->parent;
3916               dup_node = dup_node->parent;
3917               if (!node)
3918                 return dup_root;
3919             }
3920           node = node->right;
3921           p_new = &dup_node->right;
3922         }
3923     }
3924 }