2 * Amanda, The Advanced Maryland Automatic Network Disk Archiver
3 * Copyright (c) 1991-1998 University of Maryland at College Park
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of U.M. not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission. U.M. makes no representations about the
13 * suitability of this software for any purpose. It is provided "as is"
14 * without express or implied warranty.
16 * U.M. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL U.M.
18 * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
20 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
21 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
23 * Authors: the Amanda Development Team. Its members are listed in a
24 * file named AUTHORS, in the root directory of this distribution.
28 * See match.h for function prototypes and further explanations.
36 * DATA STRUCTURES, MACROS, STATIC DATA
40 * Return codes used by try_match()
44 #define MATCH_NONE (0)
45 #define MATCH_ERROR (-1)
48 * Macro to tell whether a character is a regex metacharacter. Note that '*'
49 * and '?' are NOT included: they are themselves special in globs.
52 #define IS_REGEX_META(c) ( \
53 (c) == '.' || (c) == '(' || (c) == ')' || (c) == '{' || (c) == '}' || \
54 (c) == '+' || (c) == '^' || (c) == '$' || (c) == '|' \
58 * Define a specific type to hold error messages in case regex compile/matching
62 typedef char regex_errbuf[STR_SIZE];
65 * Structure used by amglob_to_regex() to expand particular glob characters. Its
67 * - question_mark: what the question mark ('?') should be replaced with;
68 * - star: what the star ('*') should be replaced with;
69 * - double_star: what two consecutive stars should be replaced with.
71 * Note that apart from double_star, ALL OTHER FIELDS MUST NOT BE NULL.
75 const char *question_mark;
77 const char *double_star;
81 * Substitution data for glob_to_regex()
84 static struct subst_table glob_subst_stable = {
85 "[^/]", /* question_mark */
87 NULL /* double_star */
91 * Substitution data for tar_to_regex()
94 static struct subst_table tar_subst_stable = {
95 "[^/]", /* question_mark */
97 NULL /* double_star */
101 * Substitution data for match_word(): dot
104 static struct subst_table mword_dot_subst_table = {
105 "[^.]", /* question_mark */
107 ".*" /* double_star */
111 * Substitution data for match_word(): slash
114 static struct subst_table mword_slash_subst_table = {
115 "[^/]", /* question_mark */
117 ".*" /* double_star */
121 * match_word() specific data:
122 * - re_double_sep: anchored regex matching two separators;
123 * - re_separator: regex matching the separator;
124 * - re_begin_full: regex matching the separator, anchored at the beginning;
125 * - re_end_full: regex matching the separator, anchored at the end.
128 struct mword_regexes {
129 const char *re_double_sep;
130 const char *re_begin_full;
131 const char *re_separator;
132 const char *re_end_full;
135 static struct mword_regexes mword_dot_regexes = {
136 "^\\.\\.$", /* re_double_sep */
137 "^\\.", /* re_begin_full */
138 "\\.", /* re_separator */
139 "\\.$" /* re_end_full */
142 static struct mword_regexes mword_slash_regexes = {
143 "^\\/\\/$", /* re_double_sep */
144 "^\\/", /* re_begin_full */
145 "\\/", /* re_separator */
146 "\\/$" /* re_end_full */
150 * Regular expression caches, and a static mutex to protect initialization and
151 * access. This may be unnecessarily coarse, but it is unknown at this time
152 * whether GHashTable accesses are thread-safe, and get_regex_from_cache() may
153 * be called from within threads, so play it safe.
156 #if (GLIB_MAJOR_VERSION > 2 || (GLIB_MAJOR_VERSION == 2 && GLIB_MINOR_VERSION >= 31))
157 # pragma GCC diagnostic push
158 # pragma GCC diagnostic ignored "-Wmissing-field-initializers"
159 static GStaticMutex re_cache_mutex = G_STATIC_MUTEX_INIT;
160 # pragma GCC diagnostic pop
162 static GStaticMutex re_cache_mutex = G_STATIC_MUTEX_INIT;
164 static GHashTable *regex_cache = NULL, *regex_cache_newline = NULL;
171 * Initialize regex caches. NOTE: this function MUST be called with
172 * re_cache_mutex LOCKED, see get_regex_from_cache()
175 static void init_regex_caches(void)
177 static gboolean initialized = FALSE;
182 regex_cache = g_hash_table_new(g_str_hash, g_str_equal);
183 regex_cache_newline = g_hash_table_new(g_str_hash, g_str_equal);
189 * Cleanup a regular expression by escaping all non alphanumeric characters, and
190 * append beginning/end anchors if need be
193 char *clean_regex(const char *str, gboolean anchor)
198 result = g_malloc(2 * strlen(str) + 3);
204 for (src = str; *src; src++) {
205 if (!g_ascii_isalnum((int) *src))
218 * Compile one regular expression. Return TRUE if the regex has been compiled
219 * successfully. Otherwise, return FALSE and copy the error message into the
220 * supplied regex_errbuf pointer. Also, we want to know whether flags should
221 * include REG_NEWLINE (See regcomp(3) for details). Since this is the more
222 * frequent case, add REG_NEWLINE to the default flags, and remove it only if
223 * match_newline is set to FALSE.
226 static gboolean do_regex_compile(const char *str, regex_t *regex,
227 regex_errbuf *errbuf, gboolean match_newline)
229 int flags = REG_EXTENDED | REG_NOSUB | REG_NEWLINE;
233 flags &= ~REG_NEWLINE;
235 result = regcomp(regex, str, flags);
240 regerror(result, regex, *errbuf, sizeof(*errbuf));
245 * Get an already compiled buffer from the regex cache. If the regex is not in
246 * the cache, allocate a new one and compile it using do_regex_compile(). If the
247 * compile fails, call regfree() on the object and return NULL to the caller. If
248 * it does succeed, put the regex buffer in cache and return a pointer to it.
251 static regex_t *get_regex_from_cache(const char *re_str, regex_errbuf *errbuf,
252 gboolean match_newline)
257 g_static_mutex_lock(&re_cache_mutex);
261 cache = (match_newline) ? regex_cache_newline: regex_cache;
262 ret = g_hash_table_lookup(cache, re_str);
267 ret = g_new(regex_t, 1);
269 if (do_regex_compile(re_str, ret, errbuf, match_newline)) {
270 g_hash_table_insert(cache, g_strdup(re_str), ret);
279 g_static_mutex_unlock(&re_cache_mutex);
284 * Validate one regular expression using do_regex_compile(), and return NULL if
285 * the regex is valid, or the error message otherwise.
288 char *validate_regexp(const char *regex)
291 static regex_errbuf errmsg;
294 valid = do_regex_compile(regex, &regc, &errmsg, TRUE);
297 return (valid) ? NULL : errmsg;
301 * See if a string matches a compiled regular expression. Return one of MATCH_*
302 * defined above. If, for some reason, regexec() returns something other than
303 * 0 or REG_NOMATCH, return MATCH_ERROR and print the error message in the
304 * supplied regex_errbuf.
307 static int try_match(regex_t *regex, const char *str,
308 regex_errbuf *errbuf)
310 int result = regexec(regex, str, 0, 0, 0);
317 /* Fall through: something went really wrong */
320 regerror(result, regex, *errbuf, sizeof(*errbuf));
325 * Try and match a string against a regular expression, using
326 * get_regex_from_cache() and try_match(). Exit early if the regex didn't compile
327 * or there was an error during matching.
330 int do_match(const char *regex, const char *str, gboolean match_newline)
336 re = get_regex_from_cache(regex, &errmsg, match_newline);
339 error("regex \"%s\": %s", regex, errmsg);
342 result = try_match(re, str, &errmsg);
344 if (result == MATCH_ERROR)
345 error("regex \"%s\": %s", regex, errmsg);
352 * DISK/HOST EXPRESSION HANDLING
356 * Check whether a given character should be escaped (that is, prepended with a
357 * backslash), EXCEPT for one character.
360 static gboolean should_be_escaped_except(char c, char not_this_one)
362 if (c == not_this_one)
382 * Take a disk/host expression and turn it into a full-blown amglob (with
383 * start and end anchors) following rules in amanda-match(7). The not_this_one
384 * argument represents a character which is NOT meant to be special in this
385 * case: '/' for disks and '.' for hosts.
388 static char *full_amglob_from_expression(const char *str, char not_this_one)
393 result = g_malloc(2 * strlen(str) + 3);
398 for (src = str; *src; src++) {
399 if (should_be_escaped_except(*src, not_this_one))
410 * Turn a disk/host expression into a regex
413 char *make_exact_disk_expression(const char *disk)
415 return full_amglob_from_expression(disk, '/');
418 char *make_exact_host_expression(const char *host)
420 return full_amglob_from_expression(host, '.');
424 * GLOB HANDLING, as per amanda-match(7)
428 * Turn a glob into a regex.
431 static char *amglob_to_regex(const char *str, const char *begin,
432 const char *end, struct subst_table *table)
438 gboolean double_star = (table->double_star != NULL);
441 * There are two particular cases when building a regex out of a glob:
442 * character classes (anything inside [...] or [!...]) and quotes (anything
443 * preceded by a backslash). We start with none being true.
446 gboolean in_character_class = FALSE, in_quote = FALSE;
449 * Allocate enough space for our string. At worst, the allocated space is
450 * the length of the following:
451 * - beginning of regex;
452 * - size of original string multiplied by worst-case expansion;
456 * Calculate the worst case expansion by walking our struct subst_table.
459 worst_case = strlen(table->question_mark);
461 if (worst_case < strlen(table->star))
462 worst_case = strlen(table->star);
464 if (double_star && worst_case < strlen(table->double_star))
465 worst_case = strlen(table->double_star);
467 result = g_malloc(strlen(begin) + strlen(str) * worst_case + strlen(end) + 1);
470 * Start by copying the beginning of the regex...
473 dst = g_stpcpy(result, begin);
476 * ... Now to the meat of it.
479 for (src = str; *src; src++) {
483 * First, check that we're in a character class: each and every
484 * character can be copied as is. We only need to be careful if the
485 * character is a closing bracket: it will end the character class IF
486 * AND ONLY IF it is not preceded by a backslash.
489 if (in_character_class) {
490 in_character_class = ((c != ']') || (*(src - 1) == '\\'));
495 * Are we in a quote? If yes, it is really simple: copy the current
496 * character, close the quote, the end.
505 * The only thing left to handle now is the "normal" case: we are not in
506 * a character class nor in a quote.
511 * Backslash: append it, and open a new quote.
515 } else if (c == '[') {
517 * Opening bracket: the beginning of a character class.
519 * Look ahead at the next character: if it's an exclamation mark, then
520 * this is a complemented character class; append a caret to make
521 * the result string regex-friendly, and forward one character in
525 in_character_class = TRUE;
526 if (*(src + 1) == '!') {
530 } else if (IS_REGEX_META(c)) {
532 * Regex metacharacter (except for ? and *, see below): append a
533 * backslash, and then the character itself.
539 * Question mark: take the substitution string out of our subst_table
540 * and append it to the string.
542 dst = g_stpcpy(dst, table->question_mark);
545 * Star: append the substitution string found in our subst_table.
546 * However, look ahead at the next character: if it's yet another
547 * star, then see if there is a substitution string for the double
548 * star and append this one instead.
550 * FIXME: this means that two consecutive stars in a glob string
551 * where there is no substitution for double_star can lead to
552 * exponential regex execution time: consider [^/]*[^/]*.
554 const char *p = table->star;
555 if (double_star && *(src + 1) == '*') {
557 p = table->double_star;
559 dst = g_stpcpy(dst, p);
562 * Any other character: append each time.
570 * Done, now append the end, ONLY if we are not in a quote - a lone
571 * backslash at the end of a glob is illegal, just leave it as is, it will
572 * make the regex compile fail.
576 dst = g_stpcpy(dst, end);
589 char *glob_to_regex(const char *glob)
591 return amglob_to_regex(glob, "^", "$", &glob_subst_stable);
594 int match_glob(const char *glob, const char *str)
601 regex = glob_to_regex(glob);
602 re = get_regex_from_cache(regex, &errmsg, TRUE);
605 error("glob \"%s\" -> regex \"%s\": %s", glob, regex, errmsg);
608 result = try_match(re, str, &errmsg);
610 if (result == MATCH_ERROR)
611 error("glob \"%s\" -> regex \"%s\": %s", glob, regex, errmsg);
619 char *validate_glob(const char *glob)
621 char *regex, *ret = NULL;
623 static regex_errbuf errmsg;
625 regex = glob_to_regex(glob);
627 if (!do_regex_compile(regex, &regc, &errmsg, TRUE))
639 static char *tar_to_regex(const char *glob)
641 return amglob_to_regex(glob, "(^|/)", "($|/)", &tar_subst_stable);
644 int match_tar(const char *glob, const char *str)
651 regex = tar_to_regex(glob);
652 re = get_regex_from_cache(regex, &errmsg, TRUE);
655 error("glob \"%s\" -> regex \"%s\": %s", glob, regex, errmsg);
658 result = try_match(re, str, &errmsg);
660 if (result == MATCH_ERROR)
661 error("glob \"%s\" -> regex \"%s\": %s", glob, regex, errmsg);
672 * The functions below wrap input strings with separators and attempt to match
673 * the result. The core of the operation is the match_word() function.
677 * Check whether a glob passed as an argument to match_word() only looks for the
681 static gboolean glob_is_separator_only(const char *glob, char sep) {
682 size_t len = strlen(glob);
683 const char len2_1[] = { '^', sep , 0 }, len2_2[] = { sep, '$', 0 },
684 len3[] = { '^', sep, '$', 0 };
688 return (*glob == sep);
690 return !(!g_str_equal(glob, len2_1) && !g_str_equal(glob, len2_2));
692 return g_str_equal(glob, len3);
699 * Given a word and a separator as an argument, wrap the word with separators -
700 * if need be. For instance, if '/' is the separator, the rules are:
704 * - "//" -> left alone
706 * - "/xxx" -> "/xxx/"
707 * - "xxx/" -> "/xxx/"
708 * - "/xxx/" -> left alone
710 * (note that xxx here may contain the separator as well)
712 * Note that the returned string is dynamically allocated: it is up to the
713 * caller to free it. Note also that the first argument MUST NOT BE NULL.
716 static char *wrap_word(const char *word, const char separator, const char *glob)
718 size_t len = strlen(word);
719 size_t len_glob = strlen(glob);
723 * We allocate for the worst case, which is two bytes more than the input
724 * (have to prepend and append a separator).
726 result = g_malloc(len + 3);
730 * Zero-length: separator only
739 * Length is one: if the only character is the separator, the result
740 * string is two separators
743 if (len == 1 && word[0] == separator) {
750 * Otherwise: prepend the separator if needed, append the separator if
754 if (word[0] != separator && glob[0] != '^')
757 p = g_stpcpy(p, word);
759 if (word[len - 1] != separator && glob[len_glob-1] != '$')
767 static int match_word(const char *glob, const char *word, const char separator)
769 char *wrapped_word = wrap_word(word, separator, glob);
770 struct mword_regexes *regexes = &mword_slash_regexes;
771 struct subst_table *table = &mword_slash_subst_table;
772 gboolean not_slash = (separator != '/');
776 * We only expect two separators: '/' or '.'. If it's not '/', it has to be
780 regexes = &mword_dot_regexes;
781 table = &mword_dot_subst_table;
784 if(glob_is_separator_only(glob, separator)) {
785 ret = do_match(regexes->re_double_sep, wrapped_word, TRUE);
789 * Unlike what happens for tar and disk expressions, we need to
790 * calculate the beginning and end of our regex before calling
794 const char *begin, *end;
795 char *glob_copy = g_strdup(glob);
796 char *p, *g = glob_copy;
800 * Calculate the beginning of the regex:
801 * - by default, it is an unanchored separator;
802 * - if the glob begins with a caret, make that an anchored separator,
803 * and increment g appropriately;
804 * - if it begins with a separator, make it the empty string.
808 begin = regexes->re_separator;
813 if (*p == separator) {
814 begin = regexes->re_begin_full;
817 } else if (*p == separator)
821 * Calculate the end of the regex:
822 * - an unanchored separator by default;
823 * - if the last character is a backslash or the separator itself, it
824 * should be the empty string;
825 * - if it is a dollar sign, overwrite it with 0 and look at the
826 * character before it: if it is the separator, only anchor at the
827 * end, otherwise, add a separator before the anchor.
830 p = &(glob_copy[strlen(glob_copy) - 1]);
831 end = regexes->re_separator;
832 if (*p == '\\' || *p == separator) {
834 } else if (*p == '$') {
835 char prev = *(p - 1);
837 if (prev == separator) {
839 if (p-2 >= glob_copy) {
845 end = regexes->re_end_full;
851 regex = amglob_to_regex(g, begin, end, table);
852 ret = do_match(regex, wrapped_word, TRUE);
859 g_free(wrapped_word);
864 * Match a host expression
867 int match_host(const char *glob, const char *host)
872 lglob = g_ascii_strdown(glob, -1);
873 lhost = g_ascii_strdown(host, -1);
875 ret = match_word(lglob, lhost, '.');
883 * Match a disk expression. Not as straightforward, since Windows paths must be
888 * Convert a disk and glob from Windows expressed paths (backslashes) into Unix
891 * Note: the resulting string is dynamically allocated, it is up to the caller
894 * Note 2: UNC in convert_unc_to_unix stands for Uniform Naming Convention.
897 static char *convert_unc_to_unix(const char *unc)
899 char *result = g_strdup(unc);
900 return g_strdelimit(result, "\\", '/');
903 static char *convert_winglob_to_unix(const char *glob)
907 result = g_malloc(strlen(glob) + 1);
910 for (src = glob; *src; src++) {
911 if (*src == '\\' && *(src + 1) == '\\') {
923 * Match a disk expression
926 int match_disk(const char *glob, const char *disk)
928 char *glob2 = NULL, *disk2 = NULL;
929 const char *g = glob, *d = disk;
933 * Check whether our disk potentially refers to a Windows share (the first
934 * two characters are '\' and there is no / in the word at all): if yes,
935 * build Unix paths instead and pass those as arguments to match_word()
938 gboolean windows_share = !(strncmp(disk, "\\\\", 2) || strchr(disk, '/'));
941 glob2 = convert_winglob_to_unix(glob);
942 disk2 = convert_unc_to_unix(disk);
943 g = (const char *) glob2;
944 d = (const char *) disk2;
947 result = match_word(g, d, '/');
950 * We can g_free(NULL), so this is "safe"
959 * TIMESTAMPS/LEVEL MATCHING
967 if (!isdigit((int)*(str++)))
975 const char * dateexp,
976 const char * datestamp)
979 size_t len, len_suffix;
981 char firstdate[100], lastdate[100];
985 if(strlen(dateexp) >= 100 || strlen(dateexp) < 1) {
989 /* strip and ignore an initial "^" */
990 if(dateexp[0] == '^') {
991 strncpy(mydateexp, dateexp+1, sizeof(mydateexp)-1);
992 mydateexp[sizeof(mydateexp)-1] = '\0';
995 strncpy(mydateexp, dateexp, sizeof(mydateexp)-1);
996 mydateexp[sizeof(mydateexp)-1] = '\0';
999 if(strlen(dateexp) < 1) {
1003 if(mydateexp[strlen(mydateexp)-1] == '$') {
1005 mydateexp[strlen(mydateexp)-1] = '\0'; /* strip the trailing $ */
1010 /* a single dash represents a date range */
1011 if((dash = strchr(mydateexp,'-'))) {
1012 if(match_exact == 1 || strchr(dash+1, '-')) {
1016 /* format: XXXYYYY-ZZZZ, indicating dates XXXYYYY to XXXZZZZ */
1018 len = (size_t)(dash - mydateexp); /* length of XXXYYYY */
1019 len_suffix = strlen(dash) - 1; /* length of ZZZZ */
1020 if (len_suffix > len) goto illegal;
1021 if (len < len_suffix) {
1024 len_prefix = len - len_suffix; /* length of XXX */
1028 strncpy(firstdate, mydateexp, len);
1029 firstdate[len] = '\0';
1030 strncpy(lastdate, mydateexp, len_prefix);
1031 strncpy(&(lastdate[len_prefix]), dash, len_suffix);
1032 lastdate[len] = '\0';
1033 if (!alldigits(firstdate) || !alldigits(lastdate))
1035 if (strncmp(firstdate, lastdate, strlen(firstdate)) > 0)
1037 return ((strncmp(datestamp, firstdate, strlen(firstdate)) >= 0) &&
1038 (strncmp(datestamp, lastdate , strlen(lastdate)) <= 0));
1041 if (!alldigits(mydateexp))
1043 if(match_exact == 1) {
1044 return (g_str_equal(datestamp, mydateexp));
1047 return (g_str_has_prefix(datestamp, mydateexp));
1051 error("Illegal datestamp expression %s", dateexp);
1058 const char * levelexp,
1062 long int low, hi, level_i;
1063 char mylevelexp[100];
1066 if(strlen(levelexp) >= 100 || strlen(levelexp) < 1) {
1067 error("Illegal level expression %s", levelexp);
1071 if(levelexp[0] == '^') {
1072 strncpy(mylevelexp, levelexp+1, strlen(levelexp)-1);
1073 mylevelexp[strlen(levelexp)-1] = '\0';
1074 if (strlen(levelexp) == 0) {
1075 error("Illegal level expression %s", levelexp);
1080 strncpy(mylevelexp, levelexp, strlen(levelexp));
1081 mylevelexp[strlen(levelexp)] = '\0';
1084 if(mylevelexp[strlen(mylevelexp)-1] == '$') {
1086 mylevelexp[strlen(mylevelexp)-1] = '\0';
1091 if((dash = strchr(mylevelexp,'-'))) {
1092 if(match_exact == 1) {
1097 if (!alldigits(mylevelexp) || !alldigits(dash+1)) goto illegal;
1100 low = strtol(mylevelexp, (char **) NULL, 10);
1101 if (errno) goto illegal;
1102 hi = strtol(dash+1, (char **) NULL, 10);
1103 if (errno) goto illegal;
1104 level_i = strtol(level, (char **) NULL, 10);
1105 if (errno) goto illegal;
1107 return ((level_i >= low) && (level_i <= hi));
1110 if (!alldigits(mylevelexp)) goto illegal;
1111 if(match_exact == 1) {
1112 return (g_str_equal(level, mylevelexp));
1115 return (g_str_has_prefix(level, mylevelexp));
1119 error("Illegal level expression %s", levelexp);