/* Extended regular expression matching and search library.
- Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009
- Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
+ Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
+ the Free Software Foundation; either version 3, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
applies to multibyte character sets; for single byte character
sets, the SIMPLE_BRACKET again suffices. */
if (dfa->mb_cur_max > 1
- && (cset->nchar_classes || cset->non_match
+ && (cset->nchar_classes || cset->non_match || cset->nranges
# ifdef _LIBC
|| cset->nequiv_classes
# endif /* _LIBC */
re_dfastate_t *state = entry->array[j];
free_state (state);
}
- re_free (entry->array);
+ re_free (entry->array);
}
re_free (dfa->state_table);
#ifdef RE_ENABLE_I18N
init_dfa (re_dfa_t *dfa, size_t pat_len)
{
__re_size_t table_size;
+#ifndef _LIBC
+ char *codeset_name;
+#endif
#ifdef RE_ENABLE_I18N
size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
#else
dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
!= 0);
#else
- if (strcmp (locale_charset (), "UTF-8") == 0)
+ codeset_name = nl_langinfo (CODESET);
+ if (strcasecmp (codeset_name, "UTF-8") == 0
+ || strcasecmp (codeset_name, "UTF8") == 0)
dfa->is_utf8 = 1;
/* We check exhaustively in the loop below if this charset is a
Idx dest_idx = dfa->edests[node_idx].elems[0];
if (!re_node_set_contains (&init_nodes, dest_idx))
{
- re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
+ reg_errcode_t merge_err
+ = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
+ if (merge_err != REG_NOERROR)
+ return merge_err;
i = 0;
}
}
}
break;
case OP_PERIOD:
- has_period = true;
- break;
+ has_period = true;
+ break;
case OP_BACK_REF:
case OP_ALT:
case END_OF_RE:
{
dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
if (BE (dfa->inveclosures == NULL, 0))
- return REG_ESPACE;
+ return REG_ESPACE;
ret = calc_inveclosure (dfa);
}
if that's the only child). */
while (node->left || node->right)
if (node->left)
- node = node->left;
- else
- node = node->right;
+ node = node->left;
+ else
+ node = node->right;
do
{
reg_errcode_t err = fn (extra, node);
if (BE (err != REG_NOERROR, 0))
return err;
- if (node->parent == NULL)
+ if (node->parent == NULL)
return REG_NOERROR;
prev = node;
node = node->parent;
prev = node;
node = node->parent;
if (!node)
- return REG_NOERROR;
+ return REG_NOERROR;
}
node = node->right;
}
}
else if (node->token.type == SUBEXP
- && node->left && node->left->token.type == SUBEXP)
+ && node->left && node->left->token.type == SUBEXP)
{
Idx other_idx = node->left->token.opr.idx;
node->left = node->left->left;
if (node->left)
- node->left->parent = node;
+ node->left->parent = node;
dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
if (other_idx < BITSET_WORD_BITS)
node->first = node;
node->node_idx = re_dfa_add_node (dfa, node->token);
if (BE (node->node_idx == REG_MISSING, 0))
- return REG_ESPACE;
+ return REG_ESPACE;
if (node->token.type == ANCHOR)
- dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
+ dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
}
return REG_NOERROR;
}
if (node->left)
node->left->next = node->next;
if (node->right)
- node->right->next = node->next;
+ node->right->next = node->next;
break;
}
return REG_NOERROR;
case OP_BACK_REF:
dfa->nexts[idx] = node->next->node_idx;
if (node->token.type == OP_BACK_REF)
- re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
+ err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
break;
default:
destination. */
org_dest = dfa->edests[org_node].elems[0];
re_node_set_empty (dfa->edests + clone_node);
- clone_dest = search_duplicated_node (dfa, org_dest, constraint);
/* If the node is root_node itself, it means the epsilon closure
has a loop. Then tie it to the destination of the root_node. */
if (org_node == root_node && clone_node != org_node)
}
else
{
- /* There is a duplicated node which satisfy the constraint,
+ /* There is a duplicated node which satisfies the constraint,
use it to avoid infinite loop. */
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
if (BE (! ok, 0))
{
reg_errcode_t err;
Idx i;
- bool incomplete;
- bool ok;
re_node_set eclosure;
- incomplete = false;
+ bool ok;
+ bool incomplete = false;
err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
if (BE (err != REG_NOERROR, 0))
return err;
else
eclosure_elem = dfa->eclosures[edest];
/* Merge the epsilon closure of `edest'. */
- re_node_set_merge (&eclosure, &eclosure_elem);
+ err = re_node_set_merge (&eclosure, &eclosure_elem);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
/* If the epsilon closure of `edest' is incomplete,
the epsilon closure of this node is also incomplete. */
if (dfa->eclosures[edest].nelem == 0)
}
}
- /* Epsilon closures include itself. */
+ /* An epsilon closure includes itself. */
ok = re_node_set_insert (&eclosure, node);
if (BE (! ok, 0))
return REG_ESPACE;
&& dfa->word_ops_used == 0)
init_word_char (dfa);
if (token->opr.ctx_type == WORD_DELIM
- || token->opr.ctx_type == NOT_WORD_DELIM)
+ || token->opr.ctx_type == NOT_WORD_DELIM)
{
bin_tree_t *tree_first, *tree_last;
if (token->opr.ctx_type == WORD_DELIM)
token->opr.ctx_type = WORD_FIRST;
tree_first = create_token_tree (dfa, NULL, NULL, token);
token->opr.ctx_type = WORD_LAST;
- }
- else
- {
+ }
+ else
+ {
token->opr.ctx_type = INSIDE_WORD;
tree_first = create_token_tree (dfa, NULL, NULL, token);
token->opr.ctx_type = INSIDE_NOTWORD;
- }
+ }
tree_last = create_token_tree (dfa, NULL, NULL, token);
tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
{
tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
- *err = REG_EPAREN;
+ *err = REG_EPAREN;
if (BE (*err != REG_NOERROR, 0))
return NULL;
}
return elem;
}
- if (BE (end != REG_MISSING && start > end, 0))
+ if (BE ((end != REG_MISSING && start > end)
+ || token->type != OP_CLOSE_DUP_NUM, 0))
{
/* First number greater than second. */
*err = REG_BADBR;
if (BE (tree == NULL, 0))
goto parse_dup_op_espace;
+/* From gnulib's "intprops.h":
+ True if the arithmetic type T is signed. */
+#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
+
/* This loop is actually executed only when end != REG_MISSING,
to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
already created the start+1-th copy. */
- if ((Idx) -1 < 0 || end != REG_MISSING)
+ if (TYPE_SIGNED (Idx) || end != REG_MISSING)
for (i = start + 2; i <= end; ++i)
{
elem = duplicate_tree (elem, dfa);
static reg_errcode_t
internal_function
# ifdef RE_ENABLE_I18N
-build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
- bracket_elem_t *start_elem, bracket_elem_t *end_elem)
+build_range_exp (const reg_syntax_t syntax,
+ bitset_t sbcset,
+ re_charset_t *mbcset,
+ Idx *range_alloc,
+ const bracket_elem_t *start_elem,
+ const bracket_elem_t *end_elem)
# else /* not RE_ENABLE_I18N */
-build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
- bracket_elem_t *end_elem)
+build_range_exp (const reg_syntax_t syntax,
+ bitset_t sbcset,
+ const bracket_elem_t *start_elem,
+ const bracket_elem_t *end_elem)
# endif /* not RE_ENABLE_I18N */
{
unsigned int start_ch, end_ch;
return REG_ECOLLATE;
cmp_buf[0] = start_wc;
cmp_buf[4] = end_wc;
- if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
+
+ if (BE ((syntax & RE_NO_EMPTY_RANGES)
+ && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
return REG_ERANGE;
/* Got valid collation sequence values, add them as a new entry.
no MBCSET if dfa->mb_cur_max == 1. */
if (mbcset)
{
- /* Check the space of the arrays. */
- if (BE (*range_alloc == mbcset->nranges, 0))
- {
+ /* Check the space of the arrays. */
+ if (BE (*range_alloc == mbcset->nranges, 0))
+ {
/* There is not enough space, need realloc. */
wchar_t *new_array_start, *new_array_end;
Idx new_nranges;
/* Use realloc since mbcset->range_starts and mbcset->range_ends
are NULL if *range_alloc == 0. */
new_array_start = re_realloc (mbcset->range_starts, wchar_t,
- new_nranges);
+ new_nranges);
new_array_end = re_realloc (mbcset->range_ends, wchar_t,
- new_nranges);
+ new_nranges);
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
return REG_ESPACE;
mbcset->range_starts = new_array_start;
mbcset->range_ends = new_array_end;
*range_alloc = new_nranges;
- }
+ }
- mbcset->range_starts[mbcset->nranges] = start_wc;
- mbcset->range_ends[mbcset->nranges++] = end_wc;
+ mbcset->range_starts[mbcset->nranges] = start_wc;
+ mbcset->range_ends[mbcset->nranges++] = end_wc;
}
/* Build the table for single byte characters. */
return elem;
}
- /* Local function for parse_bracket_exp used in _LIBC environement.
+ /* Local function for parse_bracket_exp used in _LIBC environment.
Look up the collation sequence value of BR_ELEM.
Return the value if succeeded, UINT_MAX otherwise. */
}
else if (br_elem->type == MB_CHAR)
{
- return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
+ if (nrules != 0)
+ return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
}
else if (br_elem->type == COLL_SYM)
{
build below suffices. */
if (nrules > 0 || dfa->mb_cur_max > 1)
{
- /* Check the space of the arrays. */
- if (BE (*range_alloc == mbcset->nranges, 0))
+ /* Check the space of the arrays. */
+ if (BE (*range_alloc == mbcset->nranges, 0))
{
/* There is not enough space, need realloc. */
uint32_t *new_array_start;
new_array_start = re_realloc (mbcset->range_starts, uint32_t,
new_nranges);
new_array_end = re_realloc (mbcset->range_ends, uint32_t,
- new_nranges);
+ new_nranges);
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
- return REG_ESPACE;
+ return REG_ESPACE;
mbcset->range_starts = new_array_start;
mbcset->range_ends = new_array_end;
*range_alloc = new_nranges;
}
- mbcset->range_starts[mbcset->nranges] = start_collseq;
- mbcset->range_ends[mbcset->nranges++] = end_collseq;
+ mbcset->range_starts[mbcset->nranges] = start_collseq;
+ mbcset->range_ends[mbcset->nranges++] = end_collseq;
}
/* Build the table for single byte characters. */
&start_elem, &end_elem);
#else
# ifdef RE_ENABLE_I18N
- *err = build_range_exp (sbcset,
+ *err = build_range_exp (syntax, sbcset,
dfa->mb_cur_max > 1 ? mbcset : NULL,
&range_alloc, &start_elem, &end_elem);
# else
- *err = build_range_exp (sbcset, &start_elem, &end_elem);
+ *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
# endif
#endif /* RE_ENABLE_I18N */
if (BE (*err != REG_NOERROR, 0))
of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
if (sbc_idx < BITSET_WORDS)
{
- /* Build a tree for simple bracket. */
- br_token.type = SIMPLE_BRACKET;
- br_token.opr.sbcset = sbcset;
- work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
- if (BE (work_tree == NULL, 0))
- goto parse_bracket_exp_espace;
+ /* Build a tree for simple bracket. */
+ br_token.type = SIMPLE_BRACKET;
+ br_token.opr.sbcset = sbcset;
+ work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
+ if (BE (work_tree == NULL, 0))
+ goto parse_bracket_exp_espace;
- /* Then join them by ALT node. */
- work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
- if (BE (work_tree == NULL, 0))
- goto parse_bracket_exp_espace;
+ /* Then join them by ALT node. */
+ work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
+ if (BE (work_tree == NULL, 0))
+ goto parse_bracket_exp_espace;
}
else
{
br_token.opr.sbcset = sbcset;
work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
if (BE (work_tree == NULL, 0))
- goto parse_bracket_exp_espace;
+ goto parse_bracket_exp_espace;
}
return work_tree;
/* Build single byte matcing table for this equivalence class. */
char_buf[1] = (unsigned char) '\0';
- len = weights[idx1];
+ len = weights[idx1 & 0xffffff];
for (ch = 0; ch < SBC_MAX; ++ch)
{
char_buf[0] = ch;
if (idx2 == 0)
/* This isn't a valid character. */
continue;
- if (len == weights[idx2])
+ /* Compare only if the length matches and the collation rule
+ index is the same. */
+ if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
{
int cnt = 0;
+
while (cnt <= len &&
- weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
+ weights[(idx1 & 0xffffff) + 1 + cnt]
+ == weights[(idx2 & 0xffffff) + 1 + cnt])
++cnt;
if (cnt > len)
node = node->parent;
dup_node = dup_node->parent;
if (!node)
- return dup_root;
+ return dup_root;
}
node = node->right;
p_new = &dup_node->right;