git.gag.com Git - debian/tar/blob - gnu/regex_internal.c

   1 /* -*- buffer-read-only: t -*- vi: set ro: */
   2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
   3 /* Extended regular expression matching and search library.
   4    Copyright (C) 2002-2011 Free Software Foundation, Inc.
   5    This file is part of the GNU C Library.
   6    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   7
   8    This program is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 3, or (at your option)
  11    any later version.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License along
  19    with this program; if not, write to the Free Software Foundation,
  20    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  21
  22 static void re_string_construct_common (const char *str, Idx len,
  23                                         re_string_t *pstr,
  24                                         RE_TRANSLATE_TYPE trans, bool icase,
  25                                         const re_dfa_t *dfa) internal_function;
  26 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  27                                           const re_node_set *nodes,
  28                                           re_hashval_t hash) internal_function;
  29 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  30                                           const re_node_set *nodes,
  31                                           unsigned int context,
  32                                           re_hashval_t hash) internal_function;
  33 \f
  34 /* Functions for string operation.  */
  35
  36 /* This function allocate the buffers.  It is necessary to call
  37    re_string_reconstruct before using the object.  */
  38
  39 static reg_errcode_t
  40 internal_function __attribute_warn_unused_result__
  41 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  42                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  43 {
  44   reg_errcode_t ret;
  45   Idx init_buf_len;
  46
  47   /* Ensure at least one character fits into the buffers.  */
  48   if (init_len < dfa->mb_cur_max)
  49     init_len = dfa->mb_cur_max;
  50   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  51   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  52
  53   ret = re_string_realloc_buffers (pstr, init_buf_len);
  54   if (BE (ret != REG_NOERROR, 0))
  55     return ret;
  56
  57   pstr->word_char = dfa->word_char;
  58   pstr->word_ops_used = dfa->word_ops_used;
  59   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  60   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  61   pstr->valid_raw_len = pstr->valid_len;
  62   return REG_NOERROR;
  63 }
  64
  65 /* This function allocate the buffers, and initialize them.  */
  66
  67 static reg_errcode_t
  68 internal_function __attribute_warn_unused_result__
  69 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  70                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  71 {
  72   reg_errcode_t ret;
  73   memset (pstr, '\0', sizeof (re_string_t));
  74   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  75
  76   if (len > 0)
  77     {
  78       ret = re_string_realloc_buffers (pstr, len + 1);
  79       if (BE (ret != REG_NOERROR, 0))
  80         return ret;
  81     }
  82   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  83
  84   if (icase)
  85     {
  86 #ifdef RE_ENABLE_I18N
  87       if (dfa->mb_cur_max > 1)
  88         {
  89           while (1)
  90             {
  91               ret = build_wcs_upper_buffer (pstr);
  92               if (BE (ret != REG_NOERROR, 0))
  93                 return ret;
  94               if (pstr->valid_raw_len >= len)
  95                 break;
  96               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  97                 break;
  98               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  99               if (BE (ret != REG_NOERROR, 0))
 100                 return ret;
 101             }
 102         }
 103       else
 104 #endif /* RE_ENABLE_I18N  */
 105         build_upper_buffer (pstr);
 106     }
 107   else
 108     {
 109 #ifdef RE_ENABLE_I18N
 110       if (dfa->mb_cur_max > 1)
 111         build_wcs_buffer (pstr);
 112       else
 113 #endif /* RE_ENABLE_I18N  */
 114         {
 115           if (trans != NULL)
 116             re_string_translate_buffer (pstr);
 117           else
 118             {
 119               pstr->valid_len = pstr->bufs_len;
 120               pstr->valid_raw_len = pstr->bufs_len;
 121             }
 122         }
 123     }
 124
 125   return REG_NOERROR;
 126 }
 127
 128 /* Helper functions for re_string_allocate, and re_string_construct.  */
 129
 130 static reg_errcode_t
 131 internal_function __attribute_warn_unused_result__
 132 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 133 {
 134 #ifdef RE_ENABLE_I18N
 135   if (pstr->mb_cur_max > 1)
 136     {
 137       wint_t *new_wcs;
 138
 139       /* Avoid overflow.  */
 140       size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 141       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 142         return REG_ESPACE;
 143
 144       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 145       if (BE (new_wcs == NULL, 0))
 146         return REG_ESPACE;
 147       pstr->wcs = new_wcs;
 148       if (pstr->offsets != NULL)
 149         {
 150           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 151           if (BE (new_offsets == NULL, 0))
 152             return REG_ESPACE;
 153           pstr->offsets = new_offsets;
 154         }
 155     }
 156 #endif /* RE_ENABLE_I18N  */
 157   if (pstr->mbs_allocated)
 158     {
 159       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 160                                            new_buf_len);
 161       if (BE (new_mbs == NULL, 0))
 162         return REG_ESPACE;
 163       pstr->mbs = new_mbs;
 164     }
 165   pstr->bufs_len = new_buf_len;
 166   return REG_NOERROR;
 167 }
 168
 169
 170 static void
 171 internal_function
 172 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 173                             RE_TRANSLATE_TYPE trans, bool icase,
 174                             const re_dfa_t *dfa)
 175 {
 176   pstr->raw_mbs = (const unsigned char *) str;
 177   pstr->len = len;
 178   pstr->raw_len = len;
 179   pstr->trans = trans;
 180   pstr->icase = icase;
 181   pstr->mbs_allocated = (trans != NULL || icase);
 182   pstr->mb_cur_max = dfa->mb_cur_max;
 183   pstr->is_utf8 = dfa->is_utf8;
 184   pstr->map_notascii = dfa->map_notascii;
 185   pstr->stop = pstr->len;
 186   pstr->raw_stop = pstr->stop;
 187 }
 188
 189 #ifdef RE_ENABLE_I18N
 190
 191 /* Build wide character buffer PSTR->WCS.
 192    If the byte sequence of the string are:
 193      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 194    Then wide character buffer will be:
 195      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 196    We use WEOF for padding, they indicate that the position isn't
 197    a first byte of a multibyte character.
 198
 199    Note that this function assumes PSTR->VALID_LEN elements are already
 200    built and starts from PSTR->VALID_LEN.  */
 201
 202 static void
 203 internal_function
 204 build_wcs_buffer (re_string_t *pstr)
 205 {
 206 #ifdef _LIBC
 207   unsigned char buf[MB_LEN_MAX];
 208   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 209 #else
 210   unsigned char buf[64];
 211 #endif
 212   mbstate_t prev_st;
 213   Idx byte_idx, end_idx, remain_len;
 214   size_t mbclen;
 215
 216   /* Build the buffers from pstr->valid_len to either pstr->len or
 217      pstr->bufs_len.  */
 218   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 219   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 220     {
 221       wchar_t wc;
 222       const char *p;
 223
 224       remain_len = end_idx - byte_idx;
 225       prev_st = pstr->cur_state;
 226       /* Apply the translation if we need.  */
 227       if (BE (pstr->trans != NULL, 0))
 228         {
 229           int i, ch;
 230
 231           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 232             {
 233               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 234               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 235             }
 236           p = (const char *) buf;
 237         }
 238       else
 239         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 240       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 241       if (BE (mbclen == (size_t) -2, 0))
 242         {
 243           /* The buffer doesn't have enough space, finish to build.  */
 244           pstr->cur_state = prev_st;
 245           break;
 246         }
 247       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 248         {
 249           /* We treat these cases as a singlebyte character.  */
 250           mbclen = 1;
 251           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 252           if (BE (pstr->trans != NULL, 0))
 253             wc = pstr->trans[wc];
 254           pstr->cur_state = prev_st;
 255         }
 256
 257       /* Write wide character and padding.  */
 258       pstr->wcs[byte_idx++] = wc;
 259       /* Write paddings.  */
 260       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 261         pstr->wcs[byte_idx++] = WEOF;
 262     }
 263   pstr->valid_len = byte_idx;
 264   pstr->valid_raw_len = byte_idx;
 265 }
 266
 267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 268    but for REG_ICASE.  */
 269
 270 static reg_errcode_t
 271 internal_function __attribute_warn_unused_result__
 272 build_wcs_upper_buffer (re_string_t *pstr)
 273 {
 274   mbstate_t prev_st;
 275   Idx src_idx, byte_idx, end_idx, remain_len;
 276   size_t mbclen;
 277 #ifdef _LIBC
 278   char buf[MB_LEN_MAX];
 279   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 280 #else
 281   char buf[64];
 282 #endif
 283
 284   byte_idx = pstr->valid_len;
 285   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 286
 287   /* The following optimization assumes that ASCII characters can be
 288      mapped to wide characters with a simple cast.  */
 289   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 290     {
 291       while (byte_idx < end_idx)
 292         {
 293           wchar_t wc;
 294
 295           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 296               && mbsinit (&pstr->cur_state))
 297             {
 298               /* In case of a singlebyte character.  */
 299               pstr->mbs[byte_idx]
 300                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 301               /* The next step uses the assumption that wchar_t is encoded
 302                  ASCII-safe: all ASCII values can be converted like this.  */
 303               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 304               ++byte_idx;
 305               continue;
 306             }
 307
 308           remain_len = end_idx - byte_idx;
 309           prev_st = pstr->cur_state;
 310           mbclen = __mbrtowc (&wc,
 311                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 312                                + byte_idx), remain_len, &pstr->cur_state);
 313           if (BE (mbclen < (size_t) -2, 1))
 314             {
 315               wchar_t wcu = wc;
 316               if (iswlower (wc))
 317                 {
 318                   size_t mbcdlen;
 319
 320                   wcu = towupper (wc);
 321                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 322                   if (BE (mbclen == mbcdlen, 1))
 323                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 324                   else
 325                     {
 326                       src_idx = byte_idx;
 327                       goto offsets_needed;
 328                     }
 329                 }
 330               else
 331                 memcpy (pstr->mbs + byte_idx,
 332                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 333               pstr->wcs[byte_idx++] = wcu;
 334               /* Write paddings.  */
 335               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 336                 pstr->wcs[byte_idx++] = WEOF;
 337             }
 338           else if (mbclen == (size_t) -1 || mbclen == 0)
 339             {
 340               /* It is an invalid character or '\0'.  Just use the byte.  */
 341               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 342               pstr->mbs[byte_idx] = ch;
 343               /* And also cast it to wide char.  */
 344               pstr->wcs[byte_idx++] = (wchar_t) ch;
 345               if (BE (mbclen == (size_t) -1, 0))
 346                 pstr->cur_state = prev_st;
 347             }
 348           else
 349             {
 350               /* The buffer doesn't have enough space, finish to build.  */
 351               pstr->cur_state = prev_st;
 352               break;
 353             }
 354         }
 355       pstr->valid_len = byte_idx;
 356       pstr->valid_raw_len = byte_idx;
 357       return REG_NOERROR;
 358     }
 359   else
 360     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 361       {
 362         wchar_t wc;
 363         const char *p;
 364       offsets_needed:
 365         remain_len = end_idx - byte_idx;
 366         prev_st = pstr->cur_state;
 367         if (BE (pstr->trans != NULL, 0))
 368           {
 369             int i, ch;
 370
 371             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 372               {
 373                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 374                 buf[i] = pstr->trans[ch];
 375               }
 376             p = (const char *) buf;
 377           }
 378         else
 379           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 380         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 381         if (BE (mbclen < (size_t) -2, 1))
 382           {
 383             wchar_t wcu = wc;
 384             if (iswlower (wc))
 385               {
 386                 size_t mbcdlen;
 387
 388                 wcu = towupper (wc);
 389                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 390                 if (BE (mbclen == mbcdlen, 1))
 391                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 392                 else if (mbcdlen != (size_t) -1)
 393                   {
 394                     size_t i;
 395
 396                     if (byte_idx + mbcdlen > pstr->bufs_len)
 397                       {
 398                         pstr->cur_state = prev_st;
 399                         break;
 400                       }
 401
 402                     if (pstr->offsets == NULL)
 403                       {
 404                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 405
 406                         if (pstr->offsets == NULL)
 407                           return REG_ESPACE;
 408                       }
 409                     if (!pstr->offsets_needed)
 410                       {
 411                         for (i = 0; i < (size_t) byte_idx; ++i)
 412                           pstr->offsets[i] = i;
 413                         pstr->offsets_needed = 1;
 414                       }
 415
 416                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 417                     pstr->wcs[byte_idx] = wcu;
 418                     pstr->offsets[byte_idx] = src_idx;
 419                     for (i = 1; i < mbcdlen; ++i)
 420                       {
 421                         pstr->offsets[byte_idx + i]
 422                           = src_idx + (i < mbclen ? i : mbclen - 1);
 423                         pstr->wcs[byte_idx + i] = WEOF;
 424                       }
 425                     pstr->len += mbcdlen - mbclen;
 426                     if (pstr->raw_stop > src_idx)
 427                       pstr->stop += mbcdlen - mbclen;
 428                     end_idx = (pstr->bufs_len > pstr->len)
 429                               ? pstr->len : pstr->bufs_len;
 430                     byte_idx += mbcdlen;
 431                     src_idx += mbclen;
 432                     continue;
 433                   }
 434                 else
 435                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 436               }
 437             else
 438               memcpy (pstr->mbs + byte_idx, p, mbclen);
 439
 440             if (BE (pstr->offsets_needed != 0, 0))
 441               {
 442                 size_t i;
 443                 for (i = 0; i < mbclen; ++i)
 444                   pstr->offsets[byte_idx + i] = src_idx + i;
 445               }
 446             src_idx += mbclen;
 447
 448             pstr->wcs[byte_idx++] = wcu;
 449             /* Write paddings.  */
 450             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 451               pstr->wcs[byte_idx++] = WEOF;
 452           }
 453         else if (mbclen == (size_t) -1 || mbclen == 0)
 454           {
 455             /* It is an invalid character or '\0'.  Just use the byte.  */
 456             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 457
 458             if (BE (pstr->trans != NULL, 0))
 459               ch = pstr->trans [ch];
 460             pstr->mbs[byte_idx] = ch;
 461
 462             if (BE (pstr->offsets_needed != 0, 0))
 463               pstr->offsets[byte_idx] = src_idx;
 464             ++src_idx;
 465
 466             /* And also cast it to wide char.  */
 467             pstr->wcs[byte_idx++] = (wchar_t) ch;
 468             if (BE (mbclen == (size_t) -1, 0))
 469               pstr->cur_state = prev_st;
 470           }
 471         else
 472           {
 473             /* The buffer doesn't have enough space, finish to build.  */
 474             pstr->cur_state = prev_st;
 475             break;
 476           }
 477       }
 478   pstr->valid_len = byte_idx;
 479   pstr->valid_raw_len = src_idx;
 480   return REG_NOERROR;
 481 }
 482
 483 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 484    Return the index.  */
 485
 486 static Idx
 487 internal_function
 488 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 489 {
 490   mbstate_t prev_st;
 491   Idx rawbuf_idx;
 492   size_t mbclen;
 493   wint_t wc = WEOF;
 494
 495   /* Skip the characters which are not necessary to check.  */
 496   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 497        rawbuf_idx < new_raw_idx;)
 498     {
 499       wchar_t wc2;
 500       Idx remain_len;
 501       remain_len = pstr->len - rawbuf_idx;
 502       prev_st = pstr->cur_state;
 503       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 504                           remain_len, &pstr->cur_state);
 505       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 506         {
 507           /* We treat these cases as a single byte character.  */
 508           if (mbclen == 0 || remain_len == 0)
 509             wc = L'\0';
 510           else
 511             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 512           mbclen = 1;
 513           pstr->cur_state = prev_st;
 514         }
 515       else
 516         wc = wc2;
 517       /* Then proceed the next character.  */
 518       rawbuf_idx += mbclen;
 519     }
 520   *last_wc = wc;
 521   return rawbuf_idx;
 522 }
 523 #endif /* RE_ENABLE_I18N  */
 524
 525 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 526    This function is used in case of REG_ICASE.  */
 527
 528 static void
 529 internal_function
 530 build_upper_buffer (re_string_t *pstr)
 531 {
 532   Idx char_idx, end_idx;
 533   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 534
 535   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 536     {
 537       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 538       if (BE (pstr->trans != NULL, 0))
 539         ch = pstr->trans[ch];
 540       if (islower (ch))
 541         pstr->mbs[char_idx] = toupper (ch);
 542       else
 543         pstr->mbs[char_idx] = ch;
 544     }
 545   pstr->valid_len = char_idx;
 546   pstr->valid_raw_len = char_idx;
 547 }
 548
 549 /* Apply TRANS to the buffer in PSTR.  */
 550
 551 static void
 552 internal_function
 553 re_string_translate_buffer (re_string_t *pstr)
 554 {
 555   Idx buf_idx, end_idx;
 556   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 557
 558   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 559     {
 560       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 561       pstr->mbs[buf_idx] = pstr->trans[ch];
 562     }
 563
 564   pstr->valid_len = buf_idx;
 565   pstr->valid_raw_len = buf_idx;
 566 }
 567
 568 /* This function re-construct the buffers.
 569    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 570    convert to upper case in case of REG_ICASE, apply translation.  */
 571
 572 static reg_errcode_t
 573 internal_function __attribute_warn_unused_result__
 574 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 575 {
 576   Idx offset;
 577
 578   if (BE (pstr->raw_mbs_idx <= idx, 0))
 579     offset = idx - pstr->raw_mbs_idx;
 580   else
 581     {
 582       /* Reset buffer.  */
 583 #ifdef RE_ENABLE_I18N
 584       if (pstr->mb_cur_max > 1)
 585         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 586 #endif /* RE_ENABLE_I18N */
 587       pstr->len = pstr->raw_len;
 588       pstr->stop = pstr->raw_stop;
 589       pstr->valid_len = 0;
 590       pstr->raw_mbs_idx = 0;
 591       pstr->valid_raw_len = 0;
 592       pstr->offsets_needed = 0;
 593       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 594                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 595       if (!pstr->mbs_allocated)
 596         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 597       offset = idx;
 598     }
 599
 600   if (BE (offset != 0, 1))
 601     {
 602       /* Should the already checked characters be kept?  */
 603       if (BE (offset < pstr->valid_raw_len, 1))
 604         {
 605           /* Yes, move them to the front of the buffer.  */
 606 #ifdef RE_ENABLE_I18N
 607           if (BE (pstr->offsets_needed, 0))
 608             {
 609               Idx low = 0, high = pstr->valid_len, mid;
 610               do
 611                 {
 612                   mid = (high + low) / 2;
 613                   if (pstr->offsets[mid] > offset)
 614                     high = mid;
 615                   else if (pstr->offsets[mid] < offset)
 616                     low = mid + 1;
 617                   else
 618                     break;
 619                 }
 620               while (low < high);
 621               if (pstr->offsets[mid] < offset)
 622                 ++mid;
 623               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 624                                                         eflags);
 625               /* This can be quite complicated, so handle specially
 626                  only the common and easy case where the character with
 627                  different length representation of lower and upper
 628                  case is present at or after offset.  */
 629               if (pstr->valid_len > offset
 630                   && mid == offset && pstr->offsets[mid] == offset)
 631                 {
 632                   memmove (pstr->wcs, pstr->wcs + offset,
 633                            (pstr->valid_len - offset) * sizeof (wint_t));
 634                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 635                   pstr->valid_len -= offset;
 636                   pstr->valid_raw_len -= offset;
 637                   for (low = 0; low < pstr->valid_len; low++)
 638                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 639                 }
 640               else
 641                 {
 642                   /* Otherwise, just find out how long the partial multibyte
 643                      character at offset is and fill it with WEOF/255.  */
 644                   pstr->len = pstr->raw_len - idx + offset;
 645                   pstr->stop = pstr->raw_stop - idx + offset;
 646                   pstr->offsets_needed = 0;
 647                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 648                     --mid;
 649                   while (mid < pstr->valid_len)
 650                     if (pstr->wcs[mid] != WEOF)
 651                       break;
 652                     else
 653                       ++mid;
 654                   if (mid == pstr->valid_len)
 655                     pstr->valid_len = 0;
 656                   else
 657                     {
 658                       pstr->valid_len = pstr->offsets[mid] - offset;
 659                       if (pstr->valid_len)
 660                         {
 661                           for (low = 0; low < pstr->valid_len; ++low)
 662                             pstr->wcs[low] = WEOF;
 663                           memset (pstr->mbs, 255, pstr->valid_len);
 664                         }
 665                     }
 666                   pstr->valid_raw_len = pstr->valid_len;
 667                 }
 668             }
 669           else
 670 #endif
 671             {
 672               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 673                                                         eflags);
 674 #ifdef RE_ENABLE_I18N
 675               if (pstr->mb_cur_max > 1)
 676                 memmove (pstr->wcs, pstr->wcs + offset,
 677                          (pstr->valid_len - offset) * sizeof (wint_t));
 678 #endif /* RE_ENABLE_I18N */
 679               if (BE (pstr->mbs_allocated, 0))
 680                 memmove (pstr->mbs, pstr->mbs + offset,
 681                          pstr->valid_len - offset);
 682               pstr->valid_len -= offset;
 683               pstr->valid_raw_len -= offset;
 684 #if DEBUG
 685               assert (pstr->valid_len > 0);
 686 #endif
 687             }
 688         }
 689       else
 690         {
 691 #ifdef RE_ENABLE_I18N
 692           /* No, skip all characters until IDX.  */
 693           Idx prev_valid_len = pstr->valid_len;
 694
 695           if (BE (pstr->offsets_needed, 0))
 696             {
 697               pstr->len = pstr->raw_len - idx + offset;
 698               pstr->stop = pstr->raw_stop - idx + offset;
 699               pstr->offsets_needed = 0;
 700             }
 701 #endif
 702           pstr->valid_len = 0;
 703 #ifdef RE_ENABLE_I18N
 704           if (pstr->mb_cur_max > 1)
 705             {
 706               Idx wcs_idx;
 707               wint_t wc = WEOF;
 708
 709               if (pstr->is_utf8)
 710                 {
 711                   const unsigned char *raw, *p, *end;
 712
 713                   /* Special case UTF-8.  Multi-byte chars start with any
 714                      byte other than 0x80 - 0xbf.  */
 715                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 716                   end = raw + (offset - pstr->mb_cur_max);
 717                   if (end < pstr->raw_mbs)
 718                     end = pstr->raw_mbs;
 719                   p = raw + offset - 1;
 720 #ifdef _LIBC
 721                   /* We know the wchar_t encoding is UCS4, so for the simple
 722                      case, ASCII characters, skip the conversion step.  */
 723                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 724                     {
 725                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 726                       /* pstr->valid_len = 0; */
 727                       wc = (wchar_t) *p;
 728                     }
 729                   else
 730 #endif
 731                     for (; p >= end; --p)
 732                       if ((*p & 0xc0) != 0x80)
 733                         {
 734                           mbstate_t cur_state;
 735                           wchar_t wc2;
 736                           Idx mlen = raw + pstr->len - p;
 737                           size_t mbclen;
 738
 739 #if 0 /* dead code: buf is set but never used */
 740                           unsigned char buf[6];
 741                           if (BE (pstr->trans != NULL, 0))
 742                             {
 743                               int i = mlen < 6 ? mlen : 6;
 744                               while (--i >= 0)
 745                                 buf[i] = pstr->trans[p[i]];
 746                             }
 747 #endif
 748                           /* XXX Don't use mbrtowc, we know which conversion
 749                              to use (UTF-8 -> UCS4).  */
 750                           memset (&cur_state, 0, sizeof (cur_state));
 751                           mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
 752                                               &cur_state);
 753                           if (raw + offset - p <= mbclen
 754                               && mbclen < (size_t) -2)
 755                             {
 756                               memset (&pstr->cur_state, '\0',
 757                                       sizeof (mbstate_t));
 758                               pstr->valid_len = mbclen - (raw + offset - p);
 759                               wc = wc2;
 760                             }
 761                           break;
 762                         }
 763                 }
 764
 765               if (wc == WEOF)
 766                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 767               if (wc == WEOF)
 768                 pstr->tip_context
 769                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 770               else
 771                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 772                                       && IS_WIDE_WORD_CHAR (wc))
 773                                      ? CONTEXT_WORD
 774                                      : ((IS_WIDE_NEWLINE (wc)
 775                                          && pstr->newline_anchor)
 776                                         ? CONTEXT_NEWLINE : 0));
 777               if (BE (pstr->valid_len, 0))
 778                 {
 779                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 780                     pstr->wcs[wcs_idx] = WEOF;
 781                   if (pstr->mbs_allocated)
 782                     memset (pstr->mbs, 255, pstr->valid_len);
 783                 }
 784               pstr->valid_raw_len = pstr->valid_len;
 785             }
 786           else
 787 #endif /* RE_ENABLE_I18N */
 788             {
 789               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 790               pstr->valid_raw_len = 0;
 791               if (pstr->trans)
 792                 c = pstr->trans[c];
 793               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 794                                    ? CONTEXT_WORD
 795                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 796                                       ? CONTEXT_NEWLINE : 0));
 797             }
 798         }
 799       if (!BE (pstr->mbs_allocated, 0))
 800         pstr->mbs += offset;
 801     }
 802   pstr->raw_mbs_idx = idx;
 803   pstr->len -= offset;
 804   pstr->stop -= offset;
 805
 806   /* Then build the buffers.  */
 807 #ifdef RE_ENABLE_I18N
 808   if (pstr->mb_cur_max > 1)
 809     {
 810       if (pstr->icase)
 811         {
 812           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 813           if (BE (ret != REG_NOERROR, 0))
 814             return ret;
 815         }
 816       else
 817         build_wcs_buffer (pstr);
 818     }
 819   else
 820 #endif /* RE_ENABLE_I18N */
 821     if (BE (pstr->mbs_allocated, 0))
 822       {
 823         if (pstr->icase)
 824           build_upper_buffer (pstr);
 825         else if (pstr->trans != NULL)
 826           re_string_translate_buffer (pstr);
 827       }
 828     else
 829       pstr->valid_len = pstr->len;
 830
 831   pstr->cur_idx = 0;
 832   return REG_NOERROR;
 833 }
 834
 835 static unsigned char
 836 internal_function __attribute ((pure))
 837 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 838 {
 839   int ch;
 840   Idx off;
 841
 842   /* Handle the common (easiest) cases first.  */
 843   if (BE (!pstr->mbs_allocated, 1))
 844     return re_string_peek_byte (pstr, idx);
 845
 846 #ifdef RE_ENABLE_I18N
 847   if (pstr->mb_cur_max > 1
 848       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 849     return re_string_peek_byte (pstr, idx);
 850 #endif
 851
 852   off = pstr->cur_idx + idx;
 853 #ifdef RE_ENABLE_I18N
 854   if (pstr->offsets_needed)
 855     off = pstr->offsets[off];
 856 #endif
 857
 858   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 859
 860 #ifdef RE_ENABLE_I18N
 861   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 862      this function returns CAPITAL LETTER I instead of first byte of
 863      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 864      since peek_byte_case doesn't advance cur_idx in any way.  */
 865   if (pstr->offsets_needed && !isascii (ch))
 866     return re_string_peek_byte (pstr, idx);
 867 #endif
 868
 869   return ch;
 870 }
 871
 872 static unsigned char
 873 internal_function __attribute ((pure))
 874 re_string_fetch_byte_case (re_string_t *pstr)
 875 {
 876   if (BE (!pstr->mbs_allocated, 1))
 877     return re_string_fetch_byte (pstr);
 878
 879 #ifdef RE_ENABLE_I18N
 880   if (pstr->offsets_needed)
 881     {
 882       Idx off;
 883       int ch;
 884
 885       /* For tr_TR.UTF-8 [[:islower:]] there is
 886          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 887          in that case the whole multi-byte character and return
 888          the original letter.  On the other side, with
 889          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 890          anything else would complicate things too much.  */
 891
 892       if (!re_string_first_byte (pstr, pstr->cur_idx))
 893         return re_string_fetch_byte (pstr);
 894
 895       off = pstr->offsets[pstr->cur_idx];
 896       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 897
 898       if (! isascii (ch))
 899         return re_string_fetch_byte (pstr);
 900
 901       re_string_skip_bytes (pstr,
 902                             re_string_char_size_at (pstr, pstr->cur_idx));
 903       return ch;
 904     }
 905 #endif
 906
 907   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 908 }
 909
 910 static void
 911 internal_function
 912 re_string_destruct (re_string_t *pstr)
 913 {
 914 #ifdef RE_ENABLE_I18N
 915   re_free (pstr->wcs);
 916   re_free (pstr->offsets);
 917 #endif /* RE_ENABLE_I18N  */
 918   if (pstr->mbs_allocated)
 919     re_free (pstr->mbs);
 920 }
 921
 922 /* Return the context at IDX in INPUT.  */
 923
 924 static unsigned int
 925 internal_function
 926 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 927 {
 928   int c;
 929   if (BE (! REG_VALID_INDEX (idx), 0))
 930     /* In this case, we use the value stored in input->tip_context,
 931        since we can't know the character in input->mbs[-1] here.  */
 932     return input->tip_context;
 933   if (BE (idx == input->len, 0))
 934     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 935             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 936 #ifdef RE_ENABLE_I18N
 937   if (input->mb_cur_max > 1)
 938     {
 939       wint_t wc;
 940       Idx wc_idx = idx;
 941       while(input->wcs[wc_idx] == WEOF)
 942         {
 943 #ifdef DEBUG
 944           /* It must not happen.  */
 945           assert (REG_VALID_INDEX (wc_idx));
 946 #endif
 947           --wc_idx;
 948           if (! REG_VALID_INDEX (wc_idx))
 949             return input->tip_context;
 950         }
 951       wc = input->wcs[wc_idx];
 952       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 953         return CONTEXT_WORD;
 954       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 955               ? CONTEXT_NEWLINE : 0);
 956     }
 957   else
 958 #endif
 959     {
 960       c = re_string_byte_at (input, idx);
 961       if (bitset_contain (input->word_char, c))
 962         return CONTEXT_WORD;
 963       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 964     }
 965 }
 966 \f
 967 /* Functions for set operation.  */
 968
 969 static reg_errcode_t
 970 internal_function __attribute_warn_unused_result__
 971 re_node_set_alloc (re_node_set *set, Idx size)
 972 {
 973   set->alloc = size;
 974   set->nelem = 0;
 975   set->elems = re_malloc (Idx, size);
 976   if (BE (set->elems == NULL, 0))
 977     return REG_ESPACE;
 978   return REG_NOERROR;
 979 }
 980
 981 static reg_errcode_t
 982 internal_function __attribute_warn_unused_result__
 983 re_node_set_init_1 (re_node_set *set, Idx elem)
 984 {
 985   set->alloc = 1;
 986   set->nelem = 1;
 987   set->elems = re_malloc (Idx, 1);
 988   if (BE (set->elems == NULL, 0))
 989     {
 990       set->alloc = set->nelem = 0;
 991       return REG_ESPACE;
 992     }
 993   set->elems[0] = elem;
 994   return REG_NOERROR;
 995 }
 996
 997 static reg_errcode_t
 998 internal_function __attribute_warn_unused_result__
 999 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1000 {
1001   set->alloc = 2;
1002   set->elems = re_malloc (Idx, 2);
1003   if (BE (set->elems == NULL, 0))
1004     return REG_ESPACE;
1005   if (elem1 == elem2)
1006     {
1007       set->nelem = 1;
1008       set->elems[0] = elem1;
1009     }
1010   else
1011     {
1012       set->nelem = 2;
1013       if (elem1 < elem2)
1014         {
1015           set->elems[0] = elem1;
1016           set->elems[1] = elem2;
1017         }
1018       else
1019         {
1020           set->elems[0] = elem2;
1021           set->elems[1] = elem1;
1022         }
1023     }
1024   return REG_NOERROR;
1025 }
1026
1027 static reg_errcode_t
1028 internal_function __attribute_warn_unused_result__
1029 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1030 {
1031   dest->nelem = src->nelem;
1032   if (src->nelem > 0)
1033     {
1034       dest->alloc = dest->nelem;
1035       dest->elems = re_malloc (Idx, dest->alloc);
1036       if (BE (dest->elems == NULL, 0))
1037         {
1038           dest->alloc = dest->nelem = 0;
1039           return REG_ESPACE;
1040         }
1041       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1042     }
1043   else
1044     re_node_set_init_empty (dest);
1045   return REG_NOERROR;
1046 }
1047
1048 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1049    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1050    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1051
1052 static reg_errcode_t
1053 internal_function __attribute_warn_unused_result__
1054 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1055                            const re_node_set *src2)
1056 {
1057   Idx i1, i2, is, id, delta, sbase;
1058   if (src1->nelem == 0 || src2->nelem == 0)
1059     return REG_NOERROR;
1060
1061   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1062      conservative estimate.  */
1063   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1064     {
1065       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1066       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1067       if (BE (new_elems == NULL, 0))
1068         return REG_ESPACE;
1069       dest->elems = new_elems;
1070       dest->alloc = new_alloc;
1071     }
1072
1073   /* Find the items in the intersection of SRC1 and SRC2, and copy
1074      into the top of DEST those that are not already in DEST itself.  */
1075   sbase = dest->nelem + src1->nelem + src2->nelem;
1076   i1 = src1->nelem - 1;
1077   i2 = src2->nelem - 1;
1078   id = dest->nelem - 1;
1079   for (;;)
1080     {
1081       if (src1->elems[i1] == src2->elems[i2])
1082         {
1083           /* Try to find the item in DEST.  Maybe we could binary search?  */
1084           while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1085             --id;
1086
1087           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1088             dest->elems[--sbase] = src1->elems[i1];
1089
1090           if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1091             break;
1092         }
1093
1094       /* Lower the highest of the two items.  */
1095       else if (src1->elems[i1] < src2->elems[i2])
1096         {
1097           if (! REG_VALID_INDEX (--i2))
1098             break;
1099         }
1100       else
1101         {
1102           if (! REG_VALID_INDEX (--i1))
1103             break;
1104         }
1105     }
1106
1107   id = dest->nelem - 1;
1108   is = dest->nelem + src1->nelem + src2->nelem - 1;
1109   delta = is - sbase + 1;
1110
1111   /* Now copy.  When DELTA becomes zero, the remaining
1112      DEST elements are already in place; this is more or
1113      less the same loop that is in re_node_set_merge.  */
1114   dest->nelem += delta;
1115   if (delta > 0 && REG_VALID_INDEX (id))
1116     for (;;)
1117       {
1118         if (dest->elems[is] > dest->elems[id])
1119           {
1120             /* Copy from the top.  */
1121             dest->elems[id + delta--] = dest->elems[is--];
1122             if (delta == 0)
1123               break;
1124           }
1125         else
1126           {
1127             /* Slide from the bottom.  */
1128             dest->elems[id + delta] = dest->elems[id];
1129             if (! REG_VALID_INDEX (--id))
1130               break;
1131           }
1132       }
1133
1134   /* Copy remaining SRC elements.  */
1135   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1136
1137   return REG_NOERROR;
1138 }
1139
1140 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1141    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1142
1143 static reg_errcode_t
1144 internal_function __attribute_warn_unused_result__
1145 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1146                         const re_node_set *src2)
1147 {
1148   Idx i1, i2, id;
1149   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1150     {
1151       dest->alloc = src1->nelem + src2->nelem;
1152       dest->elems = re_malloc (Idx, dest->alloc);
1153       if (BE (dest->elems == NULL, 0))
1154         return REG_ESPACE;
1155     }
1156   else
1157     {
1158       if (src1 != NULL && src1->nelem > 0)
1159         return re_node_set_init_copy (dest, src1);
1160       else if (src2 != NULL && src2->nelem > 0)
1161         return re_node_set_init_copy (dest, src2);
1162       else
1163         re_node_set_init_empty (dest);
1164       return REG_NOERROR;
1165     }
1166   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1167     {
1168       if (src1->elems[i1] > src2->elems[i2])
1169         {
1170           dest->elems[id++] = src2->elems[i2++];
1171           continue;
1172         }
1173       if (src1->elems[i1] == src2->elems[i2])
1174         ++i2;
1175       dest->elems[id++] = src1->elems[i1++];
1176     }
1177   if (i1 < src1->nelem)
1178     {
1179       memcpy (dest->elems + id, src1->elems + i1,
1180              (src1->nelem - i1) * sizeof (Idx));
1181       id += src1->nelem - i1;
1182     }
1183   else if (i2 < src2->nelem)
1184     {
1185       memcpy (dest->elems + id, src2->elems + i2,
1186              (src2->nelem - i2) * sizeof (Idx));
1187       id += src2->nelem - i2;
1188     }
1189   dest->nelem = id;
1190   return REG_NOERROR;
1191 }
1192
1193 /* Calculate the union set of the sets DEST and SRC. And store it to
1194    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1195
1196 static reg_errcode_t
1197 internal_function __attribute_warn_unused_result__
1198 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1199 {
1200   Idx is, id, sbase, delta;
1201   if (src == NULL || src->nelem == 0)
1202     return REG_NOERROR;
1203   if (dest->alloc < 2 * src->nelem + dest->nelem)
1204     {
1205       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1206       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1207       if (BE (new_buffer == NULL, 0))
1208         return REG_ESPACE;
1209       dest->elems = new_buffer;
1210       dest->alloc = new_alloc;
1211     }
1212
1213   if (BE (dest->nelem == 0, 0))
1214     {
1215       dest->nelem = src->nelem;
1216       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1217       return REG_NOERROR;
1218     }
1219
1220   /* Copy into the top of DEST the items of SRC that are not
1221      found in DEST.  Maybe we could binary search in DEST?  */
1222   for (sbase = dest->nelem + 2 * src->nelem,
1223        is = src->nelem - 1, id = dest->nelem - 1;
1224        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1225     {
1226       if (dest->elems[id] == src->elems[is])
1227         is--, id--;
1228       else if (dest->elems[id] < src->elems[is])
1229         dest->elems[--sbase] = src->elems[is--];
1230       else /* if (dest->elems[id] > src->elems[is]) */
1231         --id;
1232     }
1233
1234   if (REG_VALID_INDEX (is))
1235     {
1236       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1237       sbase -= is + 1;
1238       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1239     }
1240
1241   id = dest->nelem - 1;
1242   is = dest->nelem + 2 * src->nelem - 1;
1243   delta = is - sbase + 1;
1244   if (delta == 0)
1245     return REG_NOERROR;
1246
1247   /* Now copy.  When DELTA becomes zero, the remaining
1248      DEST elements are already in place.  */
1249   dest->nelem += delta;
1250   for (;;)
1251     {
1252       if (dest->elems[is] > dest->elems[id])
1253         {
1254           /* Copy from the top.  */
1255           dest->elems[id + delta--] = dest->elems[is--];
1256           if (delta == 0)
1257             break;
1258         }
1259       else
1260         {
1261           /* Slide from the bottom.  */
1262           dest->elems[id + delta] = dest->elems[id];
1263           if (! REG_VALID_INDEX (--id))
1264             {
1265               /* Copy remaining SRC elements.  */
1266               memcpy (dest->elems, dest->elems + sbase,
1267                       delta * sizeof (Idx));
1268               break;
1269             }
1270         }
1271     }
1272
1273   return REG_NOERROR;
1274 }
1275
1276 /* Insert the new element ELEM to the re_node_set* SET.
1277    SET should not already have ELEM.
1278    Return true if successful.  */
1279
1280 static bool
1281 internal_function __attribute_warn_unused_result__
1282 re_node_set_insert (re_node_set *set, Idx elem)
1283 {
1284   Idx idx;
1285   /* In case the set is empty.  */
1286   if (set->alloc == 0)
1287     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1288
1289   if (BE (set->nelem, 0) == 0)
1290     {
1291       /* We already guaranteed above that set->alloc != 0.  */
1292       set->elems[0] = elem;
1293       ++set->nelem;
1294       return true;
1295     }
1296
1297   /* Realloc if we need.  */
1298   if (set->alloc == set->nelem)
1299     {
1300       Idx *new_elems;
1301       set->alloc = set->alloc * 2;
1302       new_elems = re_realloc (set->elems, Idx, set->alloc);
1303       if (BE (new_elems == NULL, 0))
1304         return false;
1305       set->elems = new_elems;
1306     }
1307
1308   /* Move the elements which follows the new element.  Test the
1309      first element separately to skip a check in the inner loop.  */
1310   if (elem < set->elems[0])
1311     {
1312       idx = 0;
1313       for (idx = set->nelem; idx > 0; idx--)
1314         set->elems[idx] = set->elems[idx - 1];
1315     }
1316   else
1317     {
1318       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1319         set->elems[idx] = set->elems[idx - 1];
1320     }
1321
1322   /* Insert the new element.  */
1323   set->elems[idx] = elem;
1324   ++set->nelem;
1325   return true;
1326 }
1327
1328 /* Insert the new element ELEM to the re_node_set* SET.
1329    SET should not already have any element greater than or equal to ELEM.
1330    Return true if successful.  */
1331
1332 static bool
1333 internal_function __attribute_warn_unused_result__
1334 re_node_set_insert_last (re_node_set *set, Idx elem)
1335 {
1336   /* Realloc if we need.  */
1337   if (set->alloc == set->nelem)
1338     {
1339       Idx *new_elems;
1340       set->alloc = (set->alloc + 1) * 2;
1341       new_elems = re_realloc (set->elems, Idx, set->alloc);
1342       if (BE (new_elems == NULL, 0))
1343         return false;
1344       set->elems = new_elems;
1345     }
1346
1347   /* Insert the new element.  */
1348   set->elems[set->nelem++] = elem;
1349   return true;
1350 }
1351
1352 /* Compare two node sets SET1 and SET2.
1353    Return true if SET1 and SET2 are equivalent.  */
1354
1355 static bool
1356 internal_function __attribute ((pure))
1357 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1358 {
1359   Idx i;
1360   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1361     return false;
1362   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1363     if (set1->elems[i] != set2->elems[i])
1364       return false;
1365   return true;
1366 }
1367
1368 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1369
1370 static Idx
1371 internal_function __attribute ((pure))
1372 re_node_set_contains (const re_node_set *set, Idx elem)
1373 {
1374   __re_size_t idx, right, mid;
1375   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1376     return 0;
1377
1378   /* Binary search the element.  */
1379   idx = 0;
1380   right = set->nelem - 1;
1381   while (idx < right)
1382     {
1383       mid = (idx + right) / 2;
1384       if (set->elems[mid] < elem)
1385         idx = mid + 1;
1386       else
1387         right = mid;
1388     }
1389   return set->elems[idx] == elem ? idx + 1 : 0;
1390 }
1391
1392 static void
1393 internal_function
1394 re_node_set_remove_at (re_node_set *set, Idx idx)
1395 {
1396   if (idx < 0 || idx >= set->nelem)
1397     return;
1398   --set->nelem;
1399   for (; idx < set->nelem; idx++)
1400     set->elems[idx] = set->elems[idx + 1];
1401 }
1402 \f
1403
1404 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1405    Or return REG_MISSING if an error occurred.  */
1406
1407 static Idx
1408 internal_function
1409 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1410 {
1411   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1412     {
1413       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1414       Idx *new_nexts, *new_indices;
1415       re_node_set *new_edests, *new_eclosures;
1416       re_token_t *new_nodes;
1417       size_t max_object_size =
1418         MAX (sizeof (re_token_t),
1419              MAX (sizeof (re_node_set),
1420                   sizeof (Idx)));
1421
1422       /* Avoid overflows.  */
1423       if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1424         return REG_MISSING;
1425
1426       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1427       if (BE (new_nodes == NULL, 0))
1428         return REG_MISSING;
1429       dfa->nodes = new_nodes;
1430       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1431       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1432       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1433       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1434       if (BE (new_nexts == NULL || new_indices == NULL
1435               || new_edests == NULL || new_eclosures == NULL, 0))
1436         return REG_MISSING;
1437       dfa->nexts = new_nexts;
1438       dfa->org_indices = new_indices;
1439       dfa->edests = new_edests;
1440       dfa->eclosures = new_eclosures;
1441       dfa->nodes_alloc = new_nodes_alloc;
1442     }
1443   dfa->nodes[dfa->nodes_len] = token;
1444   dfa->nodes[dfa->nodes_len].constraint = 0;
1445 #ifdef RE_ENABLE_I18N
1446   {
1447   int type = token.type;
1448   dfa->nodes[dfa->nodes_len].accept_mb =
1449     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1450   }
1451 #endif
1452   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1453   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1454   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1455   return dfa->nodes_len++;
1456 }
1457
1458 static inline re_hashval_t
1459 internal_function
1460 calc_state_hash (const re_node_set *nodes, unsigned int context)
1461 {
1462   re_hashval_t hash = nodes->nelem + context;
1463   Idx i;
1464   for (i = 0 ; i < nodes->nelem ; i++)
1465     hash += nodes->elems[i];
1466   return hash;
1467 }
1468
1469 /* Search for the state whose node_set is equivalent to NODES.
1470    Return the pointer to the state, if we found it in the DFA.
1471    Otherwise create the new one and return it.  In case of an error
1472    return NULL and set the error code in ERR.
1473    Note: - We assume NULL as the invalid state, then it is possible that
1474            return value is NULL and ERR is REG_NOERROR.
1475          - We never return non-NULL value in case of any errors, it is for
1476            optimization.  */
1477
1478 static re_dfastate_t *
1479 internal_function __attribute_warn_unused_result__
1480 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1481                   const re_node_set *nodes)
1482 {
1483   re_hashval_t hash;
1484   re_dfastate_t *new_state;
1485   struct re_state_table_entry *spot;
1486   Idx i;
1487 #ifdef lint
1488   /* Suppress bogus uninitialized-variable warnings.  */
1489   *err = REG_NOERROR;
1490 #endif
1491   if (BE (nodes->nelem == 0, 0))
1492     {
1493       *err = REG_NOERROR;
1494       return NULL;
1495     }
1496   hash = calc_state_hash (nodes, 0);
1497   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1498
1499   for (i = 0 ; i < spot->num ; i++)
1500     {
1501       re_dfastate_t *state = spot->array[i];
1502       if (hash != state->hash)
1503         continue;
1504       if (re_node_set_compare (&state->nodes, nodes))
1505         return state;
1506     }
1507
1508   /* There are no appropriate state in the dfa, create the new one.  */
1509   new_state = create_ci_newstate (dfa, nodes, hash);
1510   if (BE (new_state == NULL, 0))
1511     *err = REG_ESPACE;
1512
1513   return new_state;
1514 }
1515
1516 /* Search for the state whose node_set is equivalent to NODES and
1517    whose context is equivalent to CONTEXT.
1518    Return the pointer to the state, if we found it in the DFA.
1519    Otherwise create the new one and return it.  In case of an error
1520    return NULL and set the error code in ERR.
1521    Note: - We assume NULL as the invalid state, then it is possible that
1522            return value is NULL and ERR is REG_NOERROR.
1523          - We never return non-NULL value in case of any errors, it is for
1524            optimization.  */
1525
1526 static re_dfastate_t *
1527 internal_function __attribute_warn_unused_result__
1528 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1529                           const re_node_set *nodes, unsigned int context)
1530 {
1531   re_hashval_t hash;
1532   re_dfastate_t *new_state;
1533   struct re_state_table_entry *spot;
1534   Idx i;
1535 #ifdef lint
1536   /* Suppress bogus uninitialized-variable warnings.  */
1537   *err = REG_NOERROR;
1538 #endif
1539   if (nodes->nelem == 0)
1540     {
1541       *err = REG_NOERROR;
1542       return NULL;
1543     }
1544   hash = calc_state_hash (nodes, context);
1545   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1546
1547   for (i = 0 ; i < spot->num ; i++)
1548     {
1549       re_dfastate_t *state = spot->array[i];
1550       if (state->hash == hash
1551           && state->context == context
1552           && re_node_set_compare (state->entrance_nodes, nodes))
1553         return state;
1554     }
1555   /* There are no appropriate state in `dfa', create the new one.  */
1556   new_state = create_cd_newstate (dfa, nodes, context, hash);
1557   if (BE (new_state == NULL, 0))
1558     *err = REG_ESPACE;
1559
1560   return new_state;
1561 }
1562
1563 /* Finish initialization of the new state NEWSTATE, and using its hash value
1564    HASH put in the appropriate bucket of DFA's state table.  Return value
1565    indicates the error code if failed.  */
1566
1567 static reg_errcode_t
1568 __attribute_warn_unused_result__
1569 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1570                 re_hashval_t hash)
1571 {
1572   struct re_state_table_entry *spot;
1573   reg_errcode_t err;
1574   Idx i;
1575
1576   newstate->hash = hash;
1577   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1578   if (BE (err != REG_NOERROR, 0))
1579     return REG_ESPACE;
1580   for (i = 0; i < newstate->nodes.nelem; i++)
1581     {
1582       Idx elem = newstate->nodes.elems[i];
1583       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1584         if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1585           return REG_ESPACE;
1586     }
1587
1588   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1589   if (BE (spot->alloc <= spot->num, 0))
1590     {
1591       Idx new_alloc = 2 * spot->num + 2;
1592       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1593                                               new_alloc);
1594       if (BE (new_array == NULL, 0))
1595         return REG_ESPACE;
1596       spot->array = new_array;
1597       spot->alloc = new_alloc;
1598     }
1599   spot->array[spot->num++] = newstate;
1600   return REG_NOERROR;
1601 }
1602
1603 static void
1604 free_state (re_dfastate_t *state)
1605 {
1606   re_node_set_free (&state->non_eps_nodes);
1607   re_node_set_free (&state->inveclosure);
1608   if (state->entrance_nodes != &state->nodes)
1609     {
1610       re_node_set_free (state->entrance_nodes);
1611       re_free (state->entrance_nodes);
1612     }
1613   re_node_set_free (&state->nodes);
1614   re_free (state->word_trtable);
1615   re_free (state->trtable);
1616   re_free (state);
1617 }
1618
1619 /* Create the new state which is independ of contexts.
1620    Return the new state if succeeded, otherwise return NULL.  */
1621
1622 static re_dfastate_t *
1623 internal_function __attribute_warn_unused_result__
1624 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1625                     re_hashval_t hash)
1626 {
1627   Idx i;
1628   reg_errcode_t err;
1629   re_dfastate_t *newstate;
1630
1631   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1632   if (BE (newstate == NULL, 0))
1633     return NULL;
1634   err = re_node_set_init_copy (&newstate->nodes, nodes);
1635   if (BE (err != REG_NOERROR, 0))
1636     {
1637       re_free (newstate);
1638       return NULL;
1639     }
1640
1641   newstate->entrance_nodes = &newstate->nodes;
1642   for (i = 0 ; i < nodes->nelem ; i++)
1643     {
1644       re_token_t *node = dfa->nodes + nodes->elems[i];
1645       re_token_type_t type = node->type;
1646       if (type == CHARACTER && !node->constraint)
1647         continue;
1648 #ifdef RE_ENABLE_I18N
1649       newstate->accept_mb |= node->accept_mb;
1650 #endif /* RE_ENABLE_I18N */
1651
1652       /* If the state has the halt node, the state is a halt state.  */
1653       if (type == END_OF_RE)
1654         newstate->halt = 1;
1655       else if (type == OP_BACK_REF)
1656         newstate->has_backref = 1;
1657       else if (type == ANCHOR || node->constraint)
1658         newstate->has_constraint = 1;
1659     }
1660   err = register_state (dfa, newstate, hash);
1661   if (BE (err != REG_NOERROR, 0))
1662     {
1663       free_state (newstate);
1664       newstate = NULL;
1665     }
1666   return newstate;
1667 }
1668
1669 /* Create the new state which is depend on the context CONTEXT.
1670    Return the new state if succeeded, otherwise return NULL.  */
1671
1672 static re_dfastate_t *
1673 internal_function __attribute_warn_unused_result__
1674 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1675                     unsigned int context, re_hashval_t hash)
1676 {
1677   Idx i, nctx_nodes = 0;
1678   reg_errcode_t err;
1679   re_dfastate_t *newstate;
1680
1681   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1682   if (BE (newstate == NULL, 0))
1683     return NULL;
1684   err = re_node_set_init_copy (&newstate->nodes, nodes);
1685   if (BE (err != REG_NOERROR, 0))
1686     {
1687       re_free (newstate);
1688       return NULL;
1689     }
1690
1691   newstate->context = context;
1692   newstate->entrance_nodes = &newstate->nodes;
1693
1694   for (i = 0 ; i < nodes->nelem ; i++)
1695     {
1696       re_token_t *node = dfa->nodes + nodes->elems[i];
1697       re_token_type_t type = node->type;
1698       unsigned int constraint = node->constraint;
1699
1700       if (type == CHARACTER && !constraint)
1701         continue;
1702 #ifdef RE_ENABLE_I18N
1703       newstate->accept_mb |= node->accept_mb;
1704 #endif /* RE_ENABLE_I18N */
1705
1706       /* If the state has the halt node, the state is a halt state.  */
1707       if (type == END_OF_RE)
1708         newstate->halt = 1;
1709       else if (type == OP_BACK_REF)
1710         newstate->has_backref = 1;
1711
1712       if (constraint)
1713         {
1714           if (newstate->entrance_nodes == &newstate->nodes)
1715             {
1716               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1717               if (BE (newstate->entrance_nodes == NULL, 0))
1718                 {
1719                   free_state (newstate);
1720                   return NULL;
1721                 }
1722               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1723                   != REG_NOERROR)
1724                 return NULL;
1725               nctx_nodes = 0;
1726               newstate->has_constraint = 1;
1727             }
1728
1729           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1730             {
1731               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1732               ++nctx_nodes;
1733             }
1734         }
1735     }
1736   err = register_state (dfa, newstate, hash);
1737   if (BE (err != REG_NOERROR, 0))
1738     {
1739       free_state (newstate);
1740       newstate = NULL;
1741     }
1742   return  newstate;
1743 }