2 * Amanda, The Advanced Maryland Automatic Network Disk Archiver
3 * Copyright (c) 1991-1998 University of Maryland at College Park
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of U.M. not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission. U.M. makes no representations about the
13 * suitability of this software for any purpose. It is provided "as is"
14 * without express or implied warranty.
16 * U.M. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL U.M.
18 * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
20 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
21 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
23 * Authors: the Amanda Development Team. Its members are listed in a
24 * file named AUTHORS, in the root directory of this distribution.
27 * $Id: planner.c,v 1.76.2.15.2.13.2.35 2004/05/10 16:43:49 martinea Exp $
29 * backup schedule planner for the Amanda backup system.
42 #include "amfeatures.h"
43 #include "server_util.h"
46 #define MAX_LEVELS 3 /* max# of estimates per filesys */
48 #define RUNS_REDZONE 5 /* should be in conf file? */
50 #define PROMOTE_THRESHOLD 0.05 /* if <5% unbalanced, don't promote */
51 #define DEFAULT_DUMPRATE 30.0 /* K/s */
53 /* configuration file stuff */
59 int conf_runspercycle;
68 #define HOST_READY ((void *)0) /* must be 0 */
69 #define HOST_ACTIVE ((void *)1)
70 #define HOST_DONE ((void *)2)
72 #define DISK_READY 0 /* must be 0 */
76 typedef struct est_s {
82 int degr_level; /* if dump_level == 0, what would be the inc level */
89 double fullrate, incrrate;
90 double fullcomp, incrcomp;
92 int level[MAX_LEVELS];
93 char *dumpdate[MAX_LEVELS];
94 long est_size[MAX_LEVELS];
97 #define est(dp) ((est_t *)(dp)->up)
99 disklist_t startq, waitq, estq, failq, schedq;
101 double total_lev0, balanced_size, balance_threshold;
102 unsigned long tape_length, tape_mark;
103 int result_port, amanda_port;
105 static am_feature_t *our_features = NULL;
106 static char *our_feature_string = NULL;
114 long tt_blocksize_kb;
115 int runs_per_cycle = 0;
120 /* We keep a LIFO queue of before images for all modifications made
121 * to schedq in our attempt to make the schedule fit on the tape.
122 * Enough information is stored to reinstate a dump if it turns out
123 * that it shouldn't have been touched after all.
125 typedef struct bi_s {
128 int deleted; /* 0=modified, 1=deleted */
129 disk_t *dp; /* The disk that was changed */
130 int level; /* The original level */
131 long size; /* The original size */
132 char *errstr; /* A message describing why this disk is here */
135 typedef struct bilist_s {
139 bilist_t biq; /* The BI queue itself */
141 char *datestamp = NULL;
144 * ========================================================================
149 static void setup_estimate P((disk_t *dp));
150 static void get_estimates P((void));
151 static void analyze_estimate P((disk_t *dp));
152 static void handle_failed P((disk_t *dp));
153 static void delay_dumps P((void));
154 static int promote_highest_priority_incremental P((void));
155 static int promote_hills P((void));
156 static void output_scheduleline P((disk_t *dp));
166 unsigned long malloc_hist_1, malloc_size_1;
167 unsigned long malloc_hist_2, malloc_size_2;
174 times_t section_start;
176 for(fd = 3; fd < FD_SETSIZE; fd++) {
178 * Make sure nobody spoofs us with a lot of extra open files
179 * that would cause an open we do to get a very high file
180 * descriptor, which in turn might be used as an index into
181 * an array (e.g. an fd_set).
187 config_name = stralloc(argv[1]);
188 config_dir = vstralloc(CONFIG_DIR, "/", config_name, "/", NULL);
190 char my_cwd[STR_SIZE];
192 if (getcwd(my_cwd, sizeof(my_cwd)) == NULL) {
193 error("cannot determine current working directory");
195 config_dir = stralloc2(my_cwd, "/");
196 if ((config_name = strrchr(my_cwd, '/')) != NULL) {
197 config_name = stralloc(config_name + 1);
203 set_pname("planner");
205 malloc_size_1 = malloc_inuse(&malloc_hist_1);
207 erroutput_type = (ERR_AMANDALOG|ERR_INTERACTIVE);
208 set_logerror(logerror);
210 section_start = curclock();
212 our_features = am_init_feature_set();
213 our_feature_string = am_feature_to_string(our_features);
215 fprintf(stderr, "%s: pid %ld executable %s version %s\n",
216 get_pname(), (long) getpid(), argv[0], version());
217 for(vp = version_info; *vp != NULL; vp++)
218 fprintf(stderr, "%s: %s", get_pname(), *vp);
221 * 1. Networking Setup
223 * Planner runs setuid to get a privileged socket for BSD security.
224 * We get the socket right away as root, then setuid back to a normal
225 * user. If we are not using BSD security, planner is not installed
229 /* set up dgram port first thing */
233 if(dgram_bind(msg, &result_port) == -1) {
234 error("could not bind result datagram port: %s", strerror(errno));
238 /* set both real and effective uid's to real uid, likewise for gid */
244 * From this point on we are running under our real uid, so we don't
245 * have to worry about opening security holes below. Make sure we
249 if(getpwuid(getuid()) == NULL) {
250 error("can't get login name for my uid %ld", (long)getuid());
254 * 2. Read in Configuration Information
256 * All the Amanda configuration files are loaded before we begin.
259 fprintf(stderr,"READING CONF FILES...\n");
261 conffile = stralloc2(config_dir, CONFFILE_NAME);
262 if(read_conffile(conffile)) {
263 error("errors processing config file \"%s\"", conffile);
266 conf_diskfile = getconf_str(CNF_DISKFILE);
267 if (*conf_diskfile == '/') {
268 conf_diskfile = stralloc(conf_diskfile);
270 conf_diskfile = stralloc2(config_dir, conf_diskfile);
272 if((origqp = read_diskfile(conf_diskfile)) == NULL) {
273 error("could not load disklist \"%s\"", conf_diskfile);
275 match_disklist(origqp, argc-2, argv+2);
276 for(dp = origqp->head; dp != NULL; dp = dp->next) {
278 log_add(L_DISK, "%s %s", dp->host->hostname, dp->name);
280 amfree(conf_diskfile);
281 conf_tapelist = getconf_str(CNF_TAPELIST);
282 if (*conf_tapelist == '/') {
283 conf_tapelist = stralloc(conf_tapelist);
285 conf_tapelist = stralloc2(config_dir, conf_tapelist);
287 if(read_tapelist(conf_tapelist)) {
288 error("could not load tapelist \"%s\"", conf_tapelist);
290 amfree(conf_tapelist);
291 conf_infofile = getconf_str(CNF_INFOFILE);
292 if (*conf_infofile == '/') {
293 conf_infofile = stralloc(conf_infofile);
295 conf_infofile = stralloc2(config_dir, conf_infofile);
297 if(open_infofile(conf_infofile)) {
298 error("could not open info db \"%s\"", conf_infofile);
300 amfree(conf_infofile);
302 conf_tapetype = getconf_str(CNF_TAPETYPE);
303 conf_maxdumpsize = getconf_int(CNF_MAXDUMPSIZE);
304 conf_runtapes = getconf_int(CNF_RUNTAPES);
305 conf_dumpcycle = getconf_int(CNF_DUMPCYCLE);
306 conf_runspercycle = getconf_int(CNF_RUNSPERCYCLE);
307 conf_tapecycle = getconf_int(CNF_TAPECYCLE);
308 conf_bumpdays = getconf_int(CNF_BUMPDAYS);
309 conf_bumpsize = getconf_int(CNF_BUMPSIZE);
310 conf_bumpmult = getconf_real(CNF_BUMPMULT);
311 conf_etimeout = getconf_int(CNF_ETIMEOUT);
312 conf_reserve = getconf_int(CNF_RESERVE);
313 conf_autoflush = getconf_int(CNF_AUTOFLUSH);
317 datestamp = construct_datestamp(NULL);
318 log_add(L_START, "date %s", datestamp);
320 /* some initializations */
322 if(conf_runspercycle == 0) {
323 runs_per_cycle = conf_dumpcycle;
324 } else if(conf_runspercycle == -1 ) {
325 runs_per_cycle = guess_runs_from_tapelist();
327 runs_per_cycle = conf_runspercycle;
329 if (runs_per_cycle <= 0) {
334 * do some basic sanity checking
336 if(conf_tapecycle <= runs_per_cycle) {
337 log_add(L_WARNING, "tapecycle (%d) <= runspercycle (%d)",
338 conf_tapecycle, runs_per_cycle);
341 tape = lookup_tapetype(conf_tapetype);
342 if(conf_maxdumpsize > 0) {
343 tape_length = conf_maxdumpsize;
346 tape_length = tape->length * conf_runtapes;
348 tape_mark = tape->filemark;
349 tt_blocksize_kb = tape->blocksize;
350 tt_blocksize = tt_blocksize_kb * 1024;
352 proto_init(msg->socket, today, 1000); /* XXX handles should eq nhosts */
355 kerberos_service_init();
358 fprintf(stderr, "%s: time %s: startup took %s secs\n",
360 walltime_str(curclock()),
361 walltime_str(timessub(curclock(), section_start)));
364 * 3. Calculate Preliminary Dump Levels
366 * Before we can get estimates from the remote slave hosts, we make a
367 * first attempt at guessing what dump levels we will be dumping at
368 * based on the curinfo database.
371 fprintf(stderr,"\nSETTING UP FOR ESTIMATES...\n");
372 section_start = curclock();
374 startq.head = startq.tail = NULL;
375 while(!empty(*origqp)) {
376 disk_t *dp = dequeue_disk(origqp);
382 fprintf(stderr, "%s: time %s: setting up estimates took %s secs\n",
384 walltime_str(curclock()),
385 walltime_str(timessub(curclock(), section_start)));
389 * 4. Get Dump Size Estimates from Remote Client Hosts
391 * Each host is queried (in parallel) for dump size information on all
392 * of its disks, and the results gathered as they come in.
395 /* go out and get the dump estimates */
397 fprintf(stderr,"\nGETTING ESTIMATES...\n");
398 section_start = curclock();
400 estq.head = estq.tail = NULL;
401 failq.head = failq.tail = NULL;
405 fprintf(stderr, "%s: time %s: getting estimates took %s secs\n",
407 walltime_str(curclock()),
408 walltime_str(timessub(curclock(), section_start)));
411 * At this point, all disks with estimates are in estq, and
412 * all the disks on hosts that didn't respond to our inquiry
416 dump_queue("FAILED", failq, 15, stderr);
417 dump_queue("DONE", estq, 15, stderr);
421 * 5. Analyze Dump Estimates
423 * Each disk's estimates are looked at to determine what level it
424 * should dump at, and to calculate the expected size and time taking
425 * historical dump rates and compression ratios into account. The
426 * total expected size is accumulated as well.
429 fprintf(stderr,"\nANALYZING ESTIMATES...\n");
430 section_start = curclock();
432 /* an empty tape still has a label and an endmark */
433 total_size = (tt_blocksize_kb + tape_mark) * 2;
437 schedq.head = schedq.tail = NULL;
438 while(!empty(estq)) analyze_estimate(dequeue_disk(&estq));
439 while(!empty(failq)) handle_failed(dequeue_disk(&failq));
442 * At this point, all the disks are on schedq sorted by priority.
443 * The total estimated size of the backups is in total_size.
449 fprintf(stderr, "INITIAL SCHEDULE (size %ld):\n", total_size);
450 for(dp = schedq.head; dp != NULL; dp = dp->next) {
451 fprintf(stderr, " %s %s pri %d lev %d size %ld\n",
452 dp->host->hostname, dp->name, est(dp)->dump_priority,
453 est(dp)->dump_level, est(dp)->dump_size);
459 * 6. Delay Dumps if Schedule Too Big
461 * If the generated schedule is too big to fit on the tape, we need to
462 * delay some full dumps to make room. Incrementals will be done
463 * instead (except for new or forced disks).
465 * In extreme cases, delaying all the full dumps is not even enough.
466 * If so, some low-priority incrementals will be skipped completely
467 * until the dumps fit on the tape.
471 "\nDELAYING DUMPS IF NEEDED, total_size %ld, tape length %lu mark %lu\n",
472 total_size, tape_length, tape_mark);
474 initial_size = total_size;
478 /* XXX - why bother checking this? */
479 if(empty(schedq) && total_size < initial_size)
480 error("cannot fit anything on tape, bailing out");
484 * 7. Promote Dumps if Schedule Too Small
486 * Amanda attempts to balance the full dumps over the length of the
487 * dump cycle. If this night's full dumps are too small relative to
488 * the other nights, promote some high-priority full dumps that will be
489 * due for the next run, to full dumps for tonight, taking care not to
490 * overflow the tape size.
492 * This doesn't work too well for small sites. For these we scan ahead
493 * looking for nights that have an excessive number of dumps and promote
496 * Amanda never delays full dumps just for the sake of balancing the
497 * schedule, so it can take a full cycle to balance the schedule after
502 "\nPROMOTING DUMPS IF NEEDED, total_lev0 %1.0f, balanced_size %1.0f...\n",
503 total_lev0, balanced_size);
505 balance_threshold = balanced_size * PROMOTE_THRESHOLD;
507 while((balanced_size - total_lev0) > balance_threshold && moved_one)
508 moved_one = promote_highest_priority_incremental();
510 moved_one = promote_hills();
512 fprintf(stderr, "%s: time %s: analysis took %s secs\n",
514 walltime_str(curclock()),
515 walltime_str(timessub(curclock(), section_start)));
521 * The schedule goes to stdout, presumably to driver. A copy is written
522 * on stderr for the debug file.
525 fprintf(stderr,"\nGENERATING SCHEDULE:\n--------\n");
531 holding_list = get_flush(NULL, datestamp, 0, 0);
532 for(holding_file=holding_list->first; holding_file != NULL;
533 holding_file = holding_file->next) {
534 get_dumpfile(holding_file->name, &file);
536 log_add(L_DISK, "%s %s", file.name, file.disk);
538 "FLUSH %s %s %s %d %s\n",
545 "FLUSH %s %s %s %d %s\n",
552 free_sl(holding_list);
555 fprintf(stderr, "ENDFLUSH\n");
556 fprintf(stdout, "ENDFLUSH\n");
559 while(!empty(schedq)) output_scheduleline(dequeue_disk(&schedq));
560 fprintf(stderr, "--------\n");
563 log_add(L_FINISH, "date %s", datestamp);
569 amfree(our_feature_string);
570 am_release_feature_set(our_features);
573 malloc_size_2 = malloc_inuse(&malloc_hist_2);
575 if(malloc_size_1 != malloc_size_2) {
576 malloc_list(fileno(stderr), malloc_hist_1, malloc_hist_2);
585 * ========================================================================
586 * SETUP FOR ESTIMATES
590 static int last_level P((info_t *info)); /* subroutines */
591 static long est_size P((disk_t *dp, int level));
592 static long est_tape_size P((disk_t *dp, int level));
593 static int next_level0 P((disk_t *dp, info_t *info));
594 static int runs_at P((info_t *info, int lev));
595 static long bump_thresh P((int level));
596 static int when_overwrite P((char *label));
598 static void askfor(ep, seq, lev, info)
599 est_t *ep; /* estimate data block */
600 int seq; /* sequence number of request */
601 int lev; /* dump level being requested */
602 info_t *info; /* info block for disk */
606 if(seq < 0 || seq >= MAX_LEVELS) {
607 error("error [planner askfor: seq out of range 0..%d: %d]",
610 if(lev < -1 || lev >= DUMP_LEVELS) {
611 error("error [planner askfor: lev out of range -1..%d: %d]",
617 ep->dumpdate[seq] = (char *)0;
618 ep->est_size[seq] = -1;
622 ep->level[seq] = lev;
624 ep->dumpdate[seq] = stralloc(get_dumpdate(info,lev));
625 malloc_mark(ep->dumpdate[seq]);
627 stat = &info->inf[lev];
628 if(stat->date == EPOCH) ep->est_size[seq] = -1;
629 else ep->est_size[seq] = stat->size;
642 assert(dp && dp->host);
643 fprintf(stderr, "%s: time %s: setting up estimates for %s:%s\n",
644 get_pname(), walltime_str(curclock()),
645 dp->host->hostname, dp->name);
647 /* get current information about disk */
649 if(get_info(dp->host->hostname, dp->name, &info)) {
650 /* no record for this disk, make a note of it */
651 log_add(L_INFO, "Adding new disk %s:%s.", dp->host->hostname, dp->name);
654 /* setup working data struct for disk */
656 ep = alloc(sizeof(est_t));
658 dp->up = (void *) ep;
659 ep->state = DISK_READY;
661 ep->dump_priority = dp->priority;
665 /* calculated fields */
667 if(info.command & FORCE_FULL) {
668 /* force a level 0, kind of like a new disk */
669 if(dp->strategy == DS_NOFULL) {
671 * XXX - Not sure what it means to force a no-full disk. The
672 * purpose of no-full is to just dump changes relative to a
673 * stable base, for example root partitions that vary only
674 * slightly from a site-wide prototype. Only the variations
677 * If we allow a level 0 onto the Amanda cycle, then we are
678 * hosed when that tape gets re-used next. Disallow this for
682 "Cannot force full dump of %s:%s with no-full option.",
683 dp->host->hostname, dp->name);
685 /* clear force command */
686 if(info.command & FORCE_FULL)
687 info.command ^= FORCE_FULL;
688 if(put_info(dp->host->hostname, dp->name, &info))
689 error("could not put info record for %s:%s: %s",
690 dp->host->hostname, dp->name, strerror(errno));
691 ep->last_level = last_level(&info);
692 ep->next_level0 = next_level0(dp, &info);
696 ep->next_level0 = -conf_dumpcycle;
697 log_add(L_INFO, "Forcing full dump of %s:%s as directed.",
698 dp->host->hostname, dp->name);
701 else if(dp->strategy == DS_NOFULL) {
702 /* force estimate of level 1 */
704 ep->next_level0 = next_level0(dp, &info);
707 ep->last_level = last_level(&info);
708 ep->next_level0 = next_level0(dp, &info);
711 /* adjust priority levels */
713 if(ep->next_level0 < 0) {
714 fprintf(stderr,"%s:%s overdue %d day%s for level 0\n",
715 dp->host->hostname, dp->name,
716 - ep->next_level0, ((- ep->next_level0) == 1) ? "" : "s");
717 ep->dump_priority -= ep->next_level0;
718 /* warn if dump will be overwritten */
719 if(ep->last_level > -1) {
720 int overwrite_runs = when_overwrite(info.inf[0].label);
721 if(overwrite_runs == 0) {
723 "Last full dump of %s:%s on tape %s overwritten on this run.",
724 dp->host->hostname, dp->name, info.inf[0].label);
726 else if(overwrite_runs < RUNS_REDZONE) {
728 "Last full dump of %s:%s on tape %s overwritten in %d run%s.",
729 dp->host->hostname, dp->name, info.inf[0].label,
730 overwrite_runs, overwrite_runs == 1? "" : "s");
734 else if(info.command & FORCE_FULL)
735 ep->dump_priority += 1;
736 /* else XXX bump up the priority of incrementals that failed last night */
738 /* handle external level 0 dumps */
740 if(dp->skip_full && dp->strategy != DS_NOINC) {
741 if(ep->next_level0 <= 0) {
742 /* update the date field */
743 info.inf[0].date = today;
744 if(info.command & FORCE_FULL)
745 info.command ^= FORCE_FULL;
746 ep->next_level0 += conf_dumpcycle;
748 if(put_info(dp->host->hostname, dp->name, &info))
749 error("could not put info record for %s:%s: %s",
750 dp->host->hostname, dp->name, strerror(errno));
751 log_add(L_INFO, "Skipping full dump of %s:%s today.",
752 dp->host->hostname, dp->name);
753 fprintf(stderr,"%s:%s lev 0 skipped due to skip-full flag\n",
754 dp->host->hostname, dp->name);
755 /* don't enqueue the disk */
756 askfor(ep, 0, -1, &info);
757 askfor(ep, 1, -1, &info);
758 askfor(ep, 2, -1, &info);
759 fprintf(stderr, "%s: SKIPPED %s %s 0 [skip-full]\n",
760 get_pname(), dp->host->hostname, dp->name);
761 log_add(L_SUCCESS, "%s %s %s 0 [skipped: skip-full]",
762 dp->host->hostname, dp->name, datestamp);
766 if(ep->last_level == -1) {
767 /* probably a new disk, but skip-full means no full! */
771 if(ep->next_level0 == 1) {
772 log_add(L_WARNING, "Skipping full dump of %s:%s tomorrow.",
773 dp->host->hostname, dp->name);
777 /* handle "skip-incr" type archives */
779 if(dp->skip_incr && ep->next_level0 > 0) {
780 fprintf(stderr,"%s:%s lev 1 skipped due to skip-incr flag\n",
781 dp->host->hostname, dp->name);
782 /* don't enqueue the disk */
783 askfor(ep, 0, -1, &info);
784 askfor(ep, 1, -1, &info);
785 askfor(ep, 2, -1, &info);
787 fprintf(stderr, "%s: SKIPPED %s %s 1 [skip-incr]\n",
788 get_pname(), dp->host->hostname, dp->name);
790 log_add(L_SUCCESS, "%s %s %s 1 [skipped: skip-incr]",
791 dp->host->hostname, dp->name, datestamp);
795 if( ep->last_level == -1 && ep->next_level0 > 0 &&
796 dp->strategy != DS_NOFULL && dp->strategy != DS_INCRONLY &&
797 conf_reserve == 100) {
799 "%s:%s mismatch: no tapelist record, but curinfo next_level0: %d.",
800 dp->host->hostname, dp->name, ep->next_level0);
804 if(ep->last_level == 0) ep->level_days = 0;
805 else ep->level_days = runs_at(&info, ep->last_level);
806 ep->last_lev0size = info.inf[0].csize;
808 ep->fullrate = perf_average(info.full.rate, 0.0);
809 ep->incrrate = perf_average(info.incr.rate, 0.0);
811 ep->fullcomp = perf_average(info.full.comp, dp->comprate[0]);
812 ep->incrcomp = perf_average(info.incr.comp, dp->comprate[1]);
814 /* determine which estimates to get */
818 if(dp->strategy == DS_NOINC ||
820 (!(info.command & FORCE_BUMP) ||
822 ep->last_level == -1))){
824 if(info.command & FORCE_BUMP && ep->last_level == -1) {
826 "Remove force-bump command of %s:%s because it's a new disk.",
827 dp->host->hostname, dp->name);
829 switch (dp->strategy) {
832 askfor(ep, i++, 0, &info);
835 "Ignoring skip_full for %s:%s because the strategy is NOINC.",
836 dp->host->hostname, dp->name);
838 if(info.command & FORCE_BUMP) {
840 "Ignoring FORCE_BUMP for %s:%s because the strategy is NOINC.",
841 dp->host->hostname, dp->name);
850 if (info.command & FORCE_FULL)
851 askfor(ep, i++, 0, &info);
856 if(!dp->skip_incr && !(dp->strategy == DS_NOINC)) {
857 if(ep->last_level == -1) { /* a new disk */
858 if(dp->strategy == DS_NOFULL || dp->strategy == DS_INCRONLY) {
859 askfor(ep, i++, 1, &info);
861 assert(!dp->skip_full); /* should be handled above */
863 } else { /* not new, pick normally */
866 curr_level = ep->last_level;
868 if(info.command & FORCE_NO_BUMP) {
869 if(curr_level > 0) { /* level 0 already asked for */
870 askfor(ep, i++, curr_level, &info);
872 log_add(L_INFO,"Preventing bump of %s:%s as directed.",
873 dp->host->hostname, dp->name);
875 else if((info.command & FORCE_BUMP)
876 && curr_level + 1 < DUMP_LEVELS) {
877 askfor(ep, i++, curr_level+1, &info);
878 log_add(L_INFO,"Bumping of %s:%s at level %d as directed.",
879 dp->host->hostname, dp->name, curr_level+1);
881 else if(curr_level == 0) {
882 askfor(ep, i++, 1, &info);
885 askfor(ep, i++, curr_level, &info);
887 * If last time we dumped less than the threshold, then this
888 * time we will too, OR the extra size will be charged to both
889 * curr_level and curr_level + 1, so we will never bump. Also,
890 * if we haven't been at this level 2 days, or the dump failed
891 * last night, we can't bump.
893 if((info.inf[curr_level].size == 0 || /* no data, try it anyway */
894 (((info.inf[curr_level].size > bump_thresh(curr_level)))
895 && ep->level_days >= conf_bumpdays))
896 && curr_level + 1 < DUMP_LEVELS) {
897 askfor(ep, i++, curr_level+1, &info);
903 while(i < MAX_LEVELS) /* mark end of estimates */
904 askfor(ep, i++, -1, &info);
908 fprintf(stderr, "setup_estimate: %s:%s: command %d, options:",
909 dp->host->hostname, dp->name, info.command);
910 if(dp->strategy == DS_NOFULL) fputs(" no-full", stderr);
911 if(dp->strategy == DS_INCRONLY) fputs(" incr-only", stderr);
912 if(dp->skip_full) fputs(" skip-full", stderr);
913 if(dp->skip_incr) fputs(" skip-incr", stderr);
914 fprintf(stderr, "\n last_level %d next_level0 %d level_days %d\n",
915 ep->last_level, ep->next_level0, ep->level_days);
916 fprintf(stderr, " getting estimates %d (%ld) %d (%ld) %d (%ld)\n",
917 ep->level[0], ep->est_size[0],
918 ep->level[1], ep->est_size[1],
919 ep->level[2], ep->est_size[2]);
921 assert(ep->level[0] != -1);
922 enqueue_disk(&startq, dp);
925 static int when_overwrite(label)
930 if((tp = lookup_tapelabel(label)) == NULL)
931 return 1; /* "shouldn't happen", but trigger warning message */
932 else if(!reusable_tape(tp))
934 else if(lookup_nb_tape() > conf_tapecycle)
935 return (lookup_nb_tape() - tp->position) / conf_runtapes;
937 return (conf_tapecycle - tp->position) / conf_runtapes;
940 /* Return the estimated size for a particular dump */
941 static long est_size(dp, level)
947 for(i = 0; i < MAX_LEVELS; i++) {
948 if(level == est(dp)->level[i])
949 return est(dp)->est_size[i];
954 /* Return the estimated on-tape size of a particular dump */
955 static long est_tape_size(dp, level)
962 size = est_size(dp, level);
964 if(size == -1) return size;
966 if(dp->compress == COMP_NONE)
969 if(level == 0) ratio = est(dp)->fullcomp;
970 else ratio = est(dp)->incrcomp;
973 * make sure over-inflated compression ratios don't throw off the
974 * estimates, this is mostly for when you have a small dump getting
975 * compressed which takes up a lot more disk/tape space relatively due
976 * to the overhead of the compression. This is specifically for
977 * Digital Unix vdump. This patch is courtesy of Rudolf Gabler
978 * (RUG@USM.Uni-Muenchen.DE)
981 if(ratio > 1.1) ratio = 1.1;
986 * Ratio can be very small in some error situations, so make sure
987 * size goes back greater than zero. It may not be right, but
988 * indicates we did get an estimate.
998 /* what was the level of the last successful dump to tape? */
999 static int last_level(info)
1002 int min_pos, min_level, i;
1003 time_t lev0_date, last_date;
1006 if(info->last_level != -1)
1007 return info->last_level;
1009 /* to keep compatibility with old infofile */
1010 min_pos = 1000000000;
1014 for(i = 0; i < 9; i++) {
1015 if(conf_reserve < 100) {
1016 if(i == 0) lev0_date = info->inf[0].date;
1017 else if(info->inf[i].date < lev0_date) continue;
1018 if(info->inf[i].date > last_date) {
1019 last_date = info->inf[i].date;
1024 if((tp = lookup_tapelabel(info->inf[i].label)) == NULL) continue;
1025 /* cull any entries from previous cycles */
1026 if(i == 0) lev0_date = info->inf[0].date;
1027 else if(info->inf[i].date < lev0_date) continue;
1029 if(tp->position < min_pos) {
1030 min_pos = tp->position;
1035 info->last_level = i;
1039 /* when is next level 0 due? 0 = today, 1 = tomorrow, etc. */
1041 next_level0(dp, info)
1045 if(dp->strategy == DS_NOFULL || dp->strategy == DS_INCRONLY)
1046 return 1; /* fake it */
1047 else if (dp->strategy == DS_NOINC)
1049 else if(info->inf[0].date < (time_t)0)
1050 return -days_diff(EPOCH, today); /* new disk */
1052 return dp->dumpcycle - days_diff(info->inf[0].date, today);
1055 /* how many runs at current level? */
1056 static int runs_at(info, lev)
1060 tape_t *cur_tape, *old_tape;
1063 last = last_level(info);
1064 if(lev != last) return 0;
1065 if(lev == 0) return 1;
1067 if(info->consecutive_runs != -1)
1068 return info->consecutive_runs;
1070 /* to keep compatibility with old infofile */
1071 cur_tape = lookup_tapelabel(info->inf[lev].label);
1072 old_tape = lookup_tapelabel(info->inf[lev-1].label);
1073 if(cur_tape == NULL || old_tape == NULL) return 0;
1075 nb_runs = (old_tape->position - cur_tape->position) / conf_runtapes;
1076 info->consecutive_runs = nb_runs;
1082 static long bump_thresh(level)
1087 bump = conf_bumpsize;
1088 while(--level) bump = bump * conf_bumpmult;
1096 * ========================================================================
1097 * GET REMOTE DUMP SIZE ESTIMATES
1101 static void getsize P((host_t *hostp));
1102 static disk_t *lookup_hostdisk P((host_t *hp, char *str));
1103 static void handle_result P((proto_t *p, pkt_t *pkt));
1106 static void get_estimates P((void))
1110 struct servent *amandad;
1111 int something_started;
1113 if((amandad = getservbyname(AMANDA_SERVICE_NAME, "udp")) == NULL)
1114 amanda_port = AMANDA_SERVICE_DEFAULT;
1116 amanda_port = ntohs(amandad->s_port);
1118 #ifdef KRB4_SECURITY
1119 if((amandad = getservbyname(KAMANDA_SERVICE_NAME, "udp")) == NULL)
1120 kamanda_port = KAMANDA_SERVICE_DEFAULT;
1122 kamanda_port = ntohs(amandad->s_port);
1125 something_started = 1;
1126 while(something_started) {
1127 something_started = 0;
1128 for(dp = startq.head; dp != NULL; dp = dp->next) {
1130 if(hostp->up == HOST_READY) {
1131 something_started = 1;
1135 * dp is no longer on startq, so dp->next is not valid
1136 * and we have to start all over.
1144 while(!empty(waitq)) {
1145 disk_t *dp = dequeue_disk(&waitq);
1146 est(dp)->errstr = "hmm, disk was stranded on waitq";
1147 enqueue_disk(&failq, dp);
1151 static void getsize(hostp)
1156 char *req = NULL, *errstr = NULL;
1157 int i, estimates, rc, timeout, disk_state, req_len;
1158 char number[NUM_STR_SIZE];
1160 assert(hostp->disks != NULL);
1162 if(hostp->up != HOST_READY) {
1167 * The first time through here we send a "noop" request. This will
1168 * return the feature list from the client if it supports that.
1169 * If it does not, handle_result() will set the feature list to an
1170 * empty structure. In either case, we do the disks on the second
1171 * (and subsequent) pass(es).
1174 if(hostp->features != NULL) { /* sendsize service */
1175 int has_features = am_has_feature(hostp->features,
1176 fe_req_options_features);
1177 int has_hostname = am_has_feature(hostp->features,
1178 fe_req_options_hostname);
1179 int has_maxdumps = am_has_feature(hostp->features,
1180 fe_req_options_maxdumps);
1182 ap_snprintf(number, sizeof(number), "%d", hostp->maxdumps);
1183 req = vstralloc("SERVICE ", "sendsize", "\n",
1185 has_features ? "features=" : "",
1186 has_features ? our_feature_string : "",
1187 has_features ? ";" : "",
1188 has_maxdumps ? "maxdumps=" : "",
1189 has_maxdumps ? number : "",
1190 has_maxdumps ? ";" : "",
1191 has_hostname ? "hostname=" : "",
1192 has_hostname ? hostp->hostname : "",
1193 has_hostname ? ";" : "",
1196 req_len = strlen(req);
1197 req_len += 128; /* room for SECURITY ... */
1199 for(dp = hostp->disks; dp != NULL; dp = dp->hostnext) {
1203 if(dp->todo == 0) continue;
1205 if(est(dp)->state != DISK_READY) {
1208 est(dp)->got_estimate = 0;
1209 if(est(dp)->level[0] == -1) {
1210 est(dp)->state = DISK_DONE;
1211 continue; /* ignore this disk */
1214 for(i = 0; i < MAX_LEVELS; i++) {
1216 char *exclude1 = "";
1217 char *exclude2 = "";
1218 char *excludefree = NULL;
1219 char spindle[NUM_STR_SIZE];
1220 char level[NUM_STR_SIZE];
1221 int lev = est(dp)->level[i];
1223 if(lev == -1) break;
1225 ap_snprintf(level, sizeof(level), "%d", lev);
1226 ap_snprintf(spindle, sizeof(spindle), "%d", dp->spindle);
1227 if(am_has_feature(hostp->features, fe_sendsize_req_options)) {
1228 exclude1 = " OPTIONS |";
1229 exclude2 = optionstr(dp, hostp->features, NULL);
1230 excludefree = exclude2;
1233 if(dp->exclude_file && dp->exclude_file->nb_element == 1) {
1234 exclude1 = " exclude-file=";
1235 exclude2 = dp->exclude_file->first->name;
1237 else if(dp->exclude_list
1238 && dp->exclude_list->nb_element == 1) {
1239 exclude1 = " exclude-list=";
1240 exclude2 = dp->exclude_list->first->name;
1244 l = vstralloc(dp->program, " ",
1248 est(dp)->dumpdate[i], " ", spindle,
1255 l = vstralloc(dp->program, " ", dp->name, " ", level, " ",
1256 est(dp)->dumpdate[i], " ", spindle,
1262 amfree(excludefree);
1268 * Allow 2X for err response.
1270 if(req_len + s_len > MAX_DGRAM / 2) {
1278 est(dp)->state = DISK_ACTIVE;
1279 remove_disk(&startq, dp);
1282 if(estimates == 0) {
1284 hostp->up = HOST_DONE;
1288 if (conf_etimeout < 0) {
1289 timeout = - conf_etimeout;
1291 timeout = estimates * conf_etimeout;
1293 } else { /* noop service */
1294 req = vstralloc("SERVICE ", "noop", "\n",
1296 "features=", our_feature_string, ";",
1300 * We use ctimeout for the "noop" request because it should be
1301 * very fast and etimeout has other side effects.
1303 timeout = getconf_int(CNF_CTIMEOUT);
1306 #ifdef KRB4_SECURITY
1307 if(hostp->disks->auth == AUTH_KRB4)
1308 rc = make_krb_request(hostp->hostname, kamanda_port, req,
1309 hostp, timeout, handle_result);
1312 rc = make_request(hostp->hostname, amanda_port, req,
1313 hostp, timeout, handle_result);
1315 req = NULL; /* do not own this any more */
1318 errstr = vstralloc("could not resolve hostname \"",
1323 hostp->up = HOST_DONE;
1324 disk_state = DISK_DONE;
1329 hostp->up = HOST_ACTIVE;
1330 disk_state = DISK_ACTIVE;
1333 for(dp = hostp->disks; dp != NULL; dp = dp->hostnext) {
1334 if(dp->todo && est(dp)->state == DISK_ACTIVE) {
1335 est(dp)->state = disk_state;
1336 est(dp)->errstr = errstr;
1338 enqueue_disk(destqp, dp);
1344 static disk_t *lookup_hostdisk(hp, str)
1350 for(dp = hp->disks; dp != NULL; dp = dp->hostnext)
1351 if(strcmp(str, dp->name) == 0) return dp;
/*
 * Reply handler for the size-estimate (or "noop") request sent to one host.
 * This is the callback handed to make_request()/make_krb_request().
 *
 * p   - protocol state of the request; p->datap is the host_t the request
 *       was sent to, p->state and p->prevstate tell how the exchange ended
 * pkt - the reply packet, or NULL when no reply arrived
 *
 * Outline of the logic visible in this listing (some lines are hidden, so
 * the exact branch structure should be confirmed against the full file):
 *   - the host is put back to HOST_READY as soon as a result comes in;
 *   - S_FAILED with no packet is reported as a timeout, worded differently
 *     when the previous state was S_REPWAIT;
 *   - a line starting with "OPTIONS " may carry "features=", whose value
 *     replaces hostp->features;
 *   - estimate lines name a disk followed by "<level> SIZE <size>"; the size
 *     is stored in the est_size[] slot whose level[] matches, and a level
 *     that was never requested is treated as a bad message;
 *   - afterwards each todo disk still DISK_ACTIVE becomes DISK_DONE and is
 *     moved from waitq to estq (some requested level has a positive size)
 *     or to failq (all estimates failed, or none was received);
 *   - on the error path each DISK_ACTIVE disk is moved from waitq to failq
 *     with its own copy of errbuf, and the host ends up HOST_DONE.
 */
1357 static void handle_result(p, pkt)
1365 char *msgdisk=NULL, *msgdisk_undo=NULL, msgdisk_undo_ch = '\0';
1366 char *errbuf = NULL;
1374 hostp = (host_t *) p->datap;
1375 hostp->up = HOST_READY; /* request is over; the tail of this function sets HOST_DONE */
1377 if(p->state == S_FAILED && pkt == NULL) {
1378 if(p->prevstate == S_REPWAIT) {
1379 errbuf = vstralloc("Estimate timeout from ", hostp->hostname,
1383 errbuf = vstralloc("Request to ", hostp->hostname, " timed out.",
1390 fprintf(stderr, "got %sresponse from %s:\n----\n%s----\n\n",
1391 (p->state == S_FAILED) ? "NAK " : "", hostp->hostname, pkt->body);
1394 #ifdef KRB4_SECURITY
1395 if(hostp->disks->auth == AUTH_KRB4 &&
1396 !check_mutual_authenticator(host2key(hostp->hostname), pkt, p)) {
1397 errbuf = vstralloc(hostp->hostname,
1398 "[mutual-authentication failed]",
1404 msgdisk_undo = NULL;
1410 if (s[-2] == '\n') {
1414 #define sc "OPTIONS "
1415 if(strncmp(line, sc, sizeof(sc)-1) == 0) {
1418 #define sc "features="
1419 t = strstr(line, sc);
1420 if(t != NULL && (isspace((int)t[-1]) || t[-1] == ';')) {
/* drop the previous feature set before installing the one just received */
1423 am_release_feature_set(hostp->features);
1424 if((hostp->features = am_string_to_feature(t)) == NULL) {
1425 errbuf = vstralloc(hostp->hostname,
1426 ": bad features value: ",
1438 if(strncmp(line, sc, sizeof(sc)-1) == 0) {
1439 t = line + sizeof(sc)-1;
1444 skip_whitespace(t, tch);
1449 * If the "error" is that the "noop" service is unknown, it
1450 * just means the client is "old" (does not support the servie).
1451 * We can ignore this.
1453 if(hostp->features == NULL
1454 && p->state == S_FAILED
1455 && (strcmp(t - 1, "unknown service: noop") == 0
1456 || strcmp(t - 1, "noop: invalid service") == 0)) {
1459 errbuf = vstralloc(hostp->hostname,
1460 (p->state == S_FAILED) ? "NAK " : "",
1470 skip_non_whitespace(t, tch);
1471 msgdisk_undo = t - 1;
1472 msgdisk_undo_ch = *msgdisk_undo;
1473 *msgdisk_undo = '\0'; /* terminate the disk name in place; the saved byte is put back below */
1475 skip_whitespace(t, tch);
1476 if (sscanf(t - 1, "%d SIZE %ld", &level, &size) != 2) {
1480 dp = lookup_hostdisk(hostp, msgdisk);
1482 *msgdisk_undo = msgdisk_undo_ch; /* for error message */
1483 msgdisk_undo = NULL;
1486 log_add(L_ERROR, "%s: invalid reply from sendsize: `%s'\n",
1487 hostp->hostname, line);
/* store the size in the slot that was requested at this level */
1489 for(i = 0; i < MAX_LEVELS; i++) {
1490 if(est(dp)->level[i] == level) {
1491 est(dp)->est_size[i] = size;
1495 if(i == MAX_LEVELS) {
1496 goto bad_msg; /* this est wasn't requested */
1498 est(dp)->got_estimate++;
1502 if(hostp->up == HOST_READY && hostp->features == NULL) {
1504 * The client does not support the features list, so give it an
1507 dbprintf(("%s: no feature set from host %s\n",
1508 debug_prefix_time(NULL), hostp->hostname));
1509 hostp->features = am_set_default_feature_set();
1512 /* XXX what about disks that only got some estimates... do we care? */
1513 /* XXX amanda 2.1 treated that case as a bad msg */
1515 for(dp = hostp->disks; dp != NULL; dp = dp->hostnext) {
1516 if(dp->todo == 0) continue;
1517 if(est(dp)->state != DISK_ACTIVE) continue;
1518 est(dp)->state = DISK_DONE;
1519 if(est(dp)->level[0] == -1) continue; /* ignore this disk */
1520 remove_disk(&waitq, dp);
1521 if(est(dp)->got_estimate) {
1522 fprintf(stderr,"%s: time %s: got result for host %s disk %s:",
1523 get_pname(), walltime_str(curclock()),
1524 dp->host->hostname, dp->name);
1525 fprintf(stderr," %d -> %ldK, %d -> %ldK, %d -> %ldK\n",
1526 est(dp)->level[0], est(dp)->est_size[0],
1527 est(dp)->level[1], est(dp)->est_size[1],
1528 est(dp)->level[2], est(dp)->est_size[2]);
/* usable as soon as any requested level came back with a positive size */
1530 if((est(dp)->level[0] != -1 && est(dp)->est_size[0] > 0) ||
1531 (est(dp)->level[1] != -1 && est(dp)->est_size[1] > 0) ||
1532 (est(dp)->level[2] != -1 && est(dp)->est_size[2] > 0)) {
1534 if(est(dp)->level[2] != -1 && est(dp)->est_size[2] < 0) {
1536 "disk %s:%s, estimate of level %d failed: %d.",
1537 dp->host->hostname, dp->name,
1538 est(dp)->level[2], est(dp)->est_size[2]); /* NOTE(review): est_size[] is printed with %ld above and filled from a %ld scan, so the last %d here (and in the two messages below) looks like a format/argument mismatch -- confirm against the declaration */
1539 est(dp)->level[2] = -1; /* forget the level whose estimate failed */
1541 if(est(dp)->level[1] != -1 && est(dp)->est_size[1] < 0) {
1543 "disk %s:%s, estimate of level %d failed: %d.",
1544 dp->host->hostname, dp->name,
1545 est(dp)->level[1], est(dp)->est_size[1]);
1546 est(dp)->level[1] = -1;
1548 if(est(dp)->level[0] != -1 && est(dp)->est_size[0] < 0) {
1550 "disk %s:%s, estimate of level %d failed: %d.",
1551 dp->host->hostname, dp->name,
1552 est(dp)->level[0], est(dp)->est_size[0]);
1553 est(dp)->level[0] = -1;
1556 enqueue_disk(&estq, dp);
1559 enqueue_disk(&failq, dp);
1560 est(dp)->errstr = vstralloc("disk ", dp->name,
1561 ", all estimate failed", NULL);
1565 enqueue_disk(&failq, dp);
1567 fprintf(stderr, "error result for host %s disk %s: missing estimate\n",
1568 dp->host->hostname, dp->name);
1570 est(dp)->errstr = vstralloc("missing result for ", dp->name,
1571 " in ", dp->host->hostname, " response",
1581 *msgdisk_undo = msgdisk_undo_ch;
1582 msgdisk_undo = NULL;
1584 fprintf(stderr,"got a bad message, stopped at:\n");
1585 fprintf(stderr,"----\n%s\n----\n\n", line);
1586 errbuf = stralloc2("badly formatted response from ", hostp->hostname);
1587 /* fall through to ... */
1592 *msgdisk_undo = msgdisk_undo_ch;
1593 msgdisk_undo = NULL;
/* fail every disk of this host that was still waiting for an answer */
1596 for(dp = hostp->disks; dp != NULL; dp = dp->hostnext) {
1597 if(est(dp)->state == DISK_ACTIVE) {
1598 est(dp)->state = DISK_DONE;
1599 remove_disk(&waitq, dp);
1600 enqueue_disk(&failq, dp);
1603 est(dp)->errstr = stralloc(errbuf); /* each disk gets its own copy of the message */
1604 fprintf(stderr, "%s: time %s: error result for host %s disk %s: %s\n",
1605 get_pname(), walltime_str(curclock()),
1606 dp->host->hostname, dp->name, errbuf);
1611 * If there were no disks involved, make sure the error gets
1614 log_add(L_ERROR, "%s", errbuf);
1616 hostp->up = HOST_DONE;
1624 * ========================================================================
/* Forward declarations: analyze_estimate() uses both before their definitions. */
1629 static int schedule_order P((disk_t *a, disk_t *b)); /* subroutines */
1630 static int pick_inclevel P((disk_t *dp));
/*
 * Decide tonight's dump level and size for one disk and add it to the
 * schedule.
 *
 * dp - the disk to schedule; ep is its estimate record (set up on a line
 *      not shown in this listing -- presumably est(dp))
 *
 * Visible behaviour:
 *   - a disk that is due (next_level0 <= 0), or whose last dump was level 0
 *     with FORCE_NO_BUMP set, is sized as a level 0; if no level 0 estimate
 *     exists it falls back to an incremental level from pick_inclevel();
 *   - for a level 0, degraded-mode level/size are filled in as well, unless
 *     the disk is new (last_level == -1) or has skip_incr set;
 *   - otherwise an incremental level is chosen, retrying other levels while
 *     est_tape_size() keeps returning -1;
 *   - the disk is inserted into schedq in schedule_order() order and the
 *     globals total_size, total_lev0 and balanced_size are updated.
 */
1632 static void analyze_estimate(dp)
1641 fprintf(stderr, "pondering %s:%s... ",
1642 dp->host->hostname, dp->name);
1643 fprintf(stderr, "next_level0 %d last_level %d ",
1644 ep->next_level0, ep->last_level);
1646 if(get_info(dp->host->hostname, dp->name, &info) == 0) {
1650 ep->degr_level = -1; /* no degraded-mode alternative until one is worked out below */
1653 if(ep->next_level0 <= 0
1654 || (have_info && ep->last_level == 0 && (info.command & FORCE_NO_BUMP))) {
1655 if(ep->next_level0 <= 0) {
1656 fprintf(stderr,"(due for level 0) ");
1659 ep->dump_size = est_tape_size(dp, 0);
1660 if(ep->dump_size <= 0) {
1662 "(no estimate for level 0, picking an incr level)\n");
1663 ep->dump_level = pick_inclevel(dp);
1664 ep->dump_size = est_tape_size(dp, ep->dump_level);
/* no estimate at the picked level: try one level higher */
1666 if(ep->dump_size == -1) {
1667 ep->dump_level = ep->dump_level + 1;
1668 ep->dump_size = est_tape_size(dp, ep->dump_level);
1672 total_lev0 += (double) ep->dump_size;
1673 if(ep->last_level == -1 || dp->skip_incr) {
1674 fprintf(stderr,"(%s disk, can't switch to degraded mode)\n",
1675 dp->skip_incr? "skip-incr":"new");
1676 ep->degr_level = -1;
1680 /* fill in degraded mode info */
1681 fprintf(stderr,"(picking inclevel for degraded mode)");
1682 ep->degr_level = pick_inclevel(dp);
1683 ep->degr_size = est_tape_size(dp, ep->degr_level);
1684 if(ep->degr_size == -1) {
1685 ep->degr_level = ep->degr_level + 1;
1686 ep->degr_size = est_tape_size(dp, ep->degr_level);
1688 if(ep->degr_size == -1) {
1689 fprintf(stderr,"(no inc estimate)");
1690 ep->degr_level = -1;
1692 fprintf(stderr,"\n");
1697 fprintf(stderr,"(not due for a full dump, picking an incr level)\n");
1698 /* XXX - if this returns -1 may be we should force a total? */
1699 ep->dump_level = pick_inclevel(dp);
1700 ep->dump_size = est_tape_size(dp, ep->dump_level);
/* fallback chain while no estimate is found: last_level, then last_level + 1, ... */
1702 if(ep->dump_size == -1) {
1703 ep->dump_level = ep->last_level;
1704 ep->dump_size = est_tape_size(dp, ep->dump_level);
1706 if(ep->dump_size == -1) {
1707 ep->dump_level = ep->last_level + 1;
1708 ep->dump_size = est_tape_size(dp, ep->dump_level);
1710 if(ep->dump_size == -1) {
1712 ep->dump_size = est_tape_size(dp, ep->dump_level);
1716 fprintf(stderr," curr level %d size %ld ", ep->dump_level, ep->dump_size);
1718 insert_disk(&schedq, dp, schedule_order);
1720 total_size += tt_blocksize_kb + ep->dump_size + tape_mark; /* delay_one_dump() subtracts this same sum when it takes a dump back out */
1722 /* update the balanced size */
1723 if(!(dp->skip_full || dp->strategy == DS_NOFULL ||
1724 dp->strategy == DS_INCRONLY)) {
1727 lev0size = est_tape_size(dp, 0);
1728 if(lev0size == -1) lev0size = ep->last_lev0size; /* no level 0 estimate: use the recorded size of the last level 0 */
1730 balanced_size += lev0size / runs_per_cycle;
1733 fprintf(stderr,"total size %ld total_lev0 %1.0f balanced-lev0size %1.0f\n",
1734 total_size, total_lev0, balanced_size);
/*
 * Report a disk for which no usable estimate was obtained.
 *
 * dp - the failed disk; est(dp)->errstr, when set, is used as the reason
 *
 * Writes a "FAILED" line to stderr and an L_FAIL entry to the log.
 *
 * NOTE(review): the rationale quoted in the body says unavailable machines
 * should be ignored, yet a branch that falls back to historical data via
 * analyze_estimate() is still visible.  Lines hidden from this listing may
 * compile that branch out conditionally -- confirm in the full file.
 */
1737 static void handle_failed(dp)
1743 * From George Scott <George.Scott@cc.monash.edu.au>:
1745 * If a machine is down when the planner is run it guesses from historical
1746 * data what the size of tonights dump is likely to be and schedules a
1747 * dump anyway. The dumper then usually discovers that that machine is
1748 * still down and ends up with a half full tape. Unfortunately the
1749 * planner had to delay another dump because it thought that the tape was
1750 * full. The fix here is for the planner to ignore unavailable machines
1751 * rather than ignore the fact that they are unavailable.
1756 if(est(dp)->last_level != -1) {
1758 "Could not get estimate for %s:%s, using historical data.",
1759 dp->host->hostname, dp->name);
1760 analyze_estimate(dp);
1765 errstr = est(dp)->errstr? est(dp)->errstr : "hmm, no error indicator!"; /* placeholder text when no reason was recorded */
1767 fprintf(stderr, "%s: FAILED %s %s %s 0 [%s]\n",
1768 get_pname(), dp->host->hostname, dp->name, datestamp, errstr);
1770 log_add(L_FAIL, "%s %s %s 0 [%s]", dp->host->hostname, dp->name, datestamp, errstr);
1772 /* XXX - memory leak with *dp */
/*
 * Ordering function passed to insert_disk() for the schedule queue.
 *
 * a, b - the two disks to compare
 *
 * Returns the priority difference (b minus a) when the priorities differ,
 * so the higher dump_priority sorts first.  For equal priorities it returns
 * -1 when a's dump_size is larger and 1 when b's is larger, so the larger
 * dump sorts first.  The return for equal sizes is on a line not shown in
 * this listing -- presumably 0; confirm.
 */
1776 static int schedule_order(a, b)
1779 * insert-sort by decreasing priority, then
1780 * by decreasing size within priority levels.
1786 diff = est(b)->dump_priority - est(a)->dump_priority;
1787 if(diff != 0) return diff;
1789 ldiff = est(b)->dump_size - est(a)->dump_size;
1790 if(ldiff < 0) return -1; /* XXX - there has to be a better way to do this */
1791 if(ldiff > 0) return 1;
/*
 * Choose the incremental level for tonight's dump of one disk.
 *
 * dp - the disk; est(dp)->last_level is the starting ("base") level
 *
 * Visible rules, in order (several return statements are on lines hidden
 * from this listing, so the returned values marked "per message" are
 * inferred from the diagnostic text and should be confirmed):
 *   - last level was 0: level 1 (per message);
 *   - strategy DS_NOFULL: level 1 (per message);
 *   - no estimate at the base level: return base_level + 1 when that level
 *     has a positive estimate, otherwise no incremental is possible;
 *   - level_days below conf_bumpdays, or base size not above the bump
 *     threshold for this level: no bump (an additional condition is on a
 *     hidden line);
 *   - no estimate at base_level + 1: return base_level;
 *   - bumping would save less than the threshold: not bumped; otherwise the
 *     level is bumped and an L_INFO line is logged.
 */
1796 static int pick_inclevel(dp)
1799 int base_level, bump_level;
1800 long base_size, bump_size;
1803 base_level = est(dp)->last_level;
1805 /* if last night was level 0, do level 1 tonight, no ifs or buts */
1806 if(base_level == 0) {
1807 fprintf(stderr," picklev: last night 0, so tonight level 1\n");
1811 /* if no-full option set, always do level 1 */
1812 if(dp->strategy == DS_NOFULL) {
1813 fprintf(stderr," picklev: no-full set, so always level 1\n");
1817 base_size = est_size(dp, base_level);
1819 /* if we didn't get an estimate, we can't do an inc */
1820 if(base_size == -1) {
1821 base_size = est_size(dp, base_level+1);
1822 if(base_size > 0) /* FORCE_BUMP */
1823 return base_level+1;
1824 fprintf(stderr," picklev: no estimate for level %d, so no incs\n", base_level);
1828 thresh = bump_thresh(base_level); /* the threshold is looked up per level */
1831 " pick: size %ld level %d days %d (thresh %ldK, %d days)\n",
1832 base_size, base_level, est(dp)->level_days,
1833 thresh, conf_bumpdays);
1836 || est(dp)->level_days < conf_bumpdays
1837 || base_size <= thresh)
1840 bump_level = base_level + 1;
1841 bump_size = est_size(dp, bump_level);
1843 if(bump_size == -1) return base_level;
1845 fprintf(stderr, " pick: next size %ld... ", bump_size);
1847 if(base_size - bump_size < thresh) { /* the saving from bumping is below the threshold */
1848 fprintf(stderr, "not bumped\n");
1852 fprintf(stderr, "BUMPED\n");
1853 log_add(L_INFO, "Incremental of %s:%s bumped to level %d.",
1854 dp->host->hostname, dp->name, bump_level);
1863 ** ========================================================================
1866 ** We have two strategies here:
1870 ** If we are trying to fit too much on the tape something has to go. We
1871 ** try to delay totals until tomorrow by converting them into incrementals
1872 ** and, if that is not effective enough, dropping incrementals altogether.
1873 ** While we are searching for the guilty dump (the one that is really
1874 ** causing the schedule to be oversize) we have probably trampled on a lot of
1875 ** innocent dumps, so we maintain a "before image" list and use this to
1876 ** put back what we can.
1878 ** 2. Promote dumps.
1880 ** We try to keep the amount of tape used by total dumps the same each night.
1881 ** If there is some spare tape in this run we have a look to see if any of
1882 ** tonight's incrementals could be promoted to totals and leave us with a
1883 ** more balanced cycle.
1886 static void delay_one_dump P((disk_t *dp, int delete, ...)); /* forward declaration; the "..." arguments are message fragments */
/*
 * Shrink the schedule until it fits the available tape.
 *
 * Takes no arguments; works on the globals schedq, total_size, total_lev0
 * and the "before image" queue biq, which it resets on entry.
 *
 * Phases, as labelled in the body:
 *   1. dumps bigger than one tape (tape->length): a full dump is delayed to
 *      an incremental, or dropped when the disk is new or skip-incr;
 *      incrementals are dropped;
 *   2. while total_size exceeds tape_length, delay full dumps walking from
 *      the tail of schedq (2.a skips disks with FORCE_FULL set, 2.b delays
 *      those too if still needed); one level 0 entry, "preserve", is never
 *      delayed;
 *   3. while still too big, drop incrementals, again from the tail;
 *   4. walk biq from its tail and reinstate whatever fits again;
 *   5. report what remains on biq as FAILED (deleted) or delayed.
 *
 * All changes go through delay_one_dump(), which records the undo data.
 */
1888 static void delay_dumps P((void))
1889 /* delay any dumps that will not fit */
1891 disk_t *dp, *ndp, *preserve;
1893 long new_total; /* New total_size */
1894 char est_kb[20]; /* Text formatted dump size */
1895 int nb_forced_level_0;
1898 biq.head = biq.tail = NULL;
1901 ** 1. Delay dumps that are way oversize.
1903 ** Dumps larger that the size of the tapes we are using are just plain
1904 ** not going to fit no matter how many other dumps we drop. Delay
1905 ** oversize totals until tomorrow (by which time my owner will have
1906 ** resolved the problem!) and drop incrementals altogether. Naturally
1907 ** a large total might be delayed into a large incremental so these
1908 ** need to be checked for separately.
1911 for(dp = schedq.head; dp != NULL; dp = ndp) {
1912 ndp = dp->next; /* remove_disk zaps this */
1914 if (est(dp)->dump_size <= tape->length) { /* compared with tape->length here, not with the tape_length used from phase 2 on */
1918 /* Format dumpsize for messages */
1919 ap_snprintf(est_kb, 20, "%ld KB,", est(dp)->dump_size);
1921 if(est(dp)->dump_level == 0) {
1922 if(est(dp)->last_level == -1 || dp->skip_incr) {
1923 delay_one_dump(dp, 1,
1924 "dump larger than tape,",
1926 "but cannot incremental dump",
1927 dp->skip_incr ? "skip-incr": "new",
1932 delay_one_dump(dp, 0,
1933 "dump larger than tape,",
1935 "full dump delayed",
1940 delay_one_dump(dp, 1,
1941 "dump larger than tape,",
1943 "skipping incremental",
1949 ** 2. Delay total dumps.
1951 ** Delay total dumps until tomorrow (or the day after!). We start with
1952 ** the lowest priority (most dispensable) and work forwards. We take
1953 ** care not to delay *all* the dumps since this could lead to a stale
1954 ** mate [for any one disk there are only three ways tomorrows dump will
1955 ** be smaller than todays: 1. we do a level 0 today so tomorows dump
1956 ** will be a level 1; 2. the disk gets more data so that it is bumped
1957 ** tomorrow (this can be a slow process); and, 3. the disk looses some
1958 ** data (when does that ever happen?)].
1961 nb_forced_level_0 = 0;
/* the loop stops once preserve is set; presumably it becomes the first level 0 entry from the head (assignment not shown) */
1963 for(dp = schedq.head; dp != NULL && preserve == NULL; dp = dp->next)
1964 if(est(dp)->dump_level == 0)
1967 /* 2.a. Do not delay forced full */
1968 for(dp = schedq.tail;
1969 dp != NULL && total_size > tape_length;
1973 if(est(dp)->dump_level != 0) continue;
1975 get_info(dp->host->hostname, dp->name, &info); /* NOTE(review): result ignored here although analyze_estimate() tests it; info.command may be read without a successful lookup -- confirm */
1976 if(info.command & FORCE_FULL) {
1977 nb_forced_level_0 += 1;
1982 if(dp != preserve) {
1984 /* Format dumpsize for messages */
1985 ap_snprintf(est_kb, 20, "%ld KB,", est(dp)->dump_size);
1987 if(est(dp)->last_level == -1 || dp->skip_incr) {
1988 delay_one_dump(dp, 1,
1991 "but cannot incremental dump",
1992 dp->skip_incr ? "skip-incr": "new",
1997 delay_one_dump(dp, 0,
2000 "full dump delayed",
2006 /* 2.b. Delay forced full if needed */
2007 if(nb_forced_level_0 > 0 && total_size > tape_length) {
2008 for(dp = schedq.tail;
2009 dp != NULL && total_size > tape_length;
2013 if(est(dp)->dump_level == 0 && dp != preserve) {
2015 /* Format dumpsize for messages */
2016 ap_snprintf(est_kb, 20, "%ld KB,", est(dp)->dump_size);
2018 if(est(dp)->last_level == -1 || dp->skip_incr) {
2019 delay_one_dump(dp, 1,
2022 "but cannot incremental dump",
2023 dp->skip_incr ? "skip-incr": "new",
2028 delay_one_dump(dp, 0,
2031 "full dump delayed",
2039 ** 3. Delay incremental dumps.
2041 ** Delay incremental dumps until tomorrow. This is a last ditch attempt
2042 ** at making things fit. Again, we start with the lowest priority (most
2043 ** dispensable) and work forwards.
2046 for(dp = schedq.tail;
2047 dp != NULL && total_size > tape_length;
2051 if(est(dp)->dump_level != 0) {
2053 /* Format dumpsize for messages */
2054 ap_snprintf(est_kb, 20, "%ld KB,", est(dp)->dump_size);
2056 delay_one_dump(dp, 1,
2057 "dumps way too big,",
2059 "must skip incremental dumps",
2065 ** 4. Reinstate delayed dumps.
2067 ** We might not have needed to stomp on all of the dumps we have just
2068 ** delayed above. Try to reinstate them all starting with the last one
2069 ** and working forwards. It is unlikely that the last one will fit back
2070 ** in but why complicate the code?
2073 for(bi = biq.tail; bi != NULL; bi = nbi) {
/* two ways to compute the new total: re-adding a removed dump, or swapping the current size for the saved one */
2078 new_total = total_size + tt_blocksize_kb + bi->size + tape_mark;
2080 new_total = total_size - est(dp)->dump_size + bi->size;
2082 if(new_total <= tape_length && bi->size < tape->length) {
2084 total_size = new_total;
2086 if(bi->level == 0) {
2087 total_lev0 += (double) bi->size;
2089 insert_disk(&schedq, dp, schedule_order);
2092 est(dp)->dump_level = bi->level;
2093 est(dp)->dump_size = bi->size;
/* unlink bi from the doubly linked before-image queue */
2097 if(bi->next == NULL)
2098 biq.tail = bi->prev;
2100 (bi->next)->prev = bi->prev;
2101 if(bi->prev == NULL)
2102 biq.head = bi->next;
2104 (bi->prev)->next = bi->next;
2111 ** 5. Output messages about what we have done.
2113 ** We can't output messages while we are delaying dumps because we might
2114 ** reinstate them later. We remember all the messages and output them
2118 for(bi = biq.head; bi != NULL; bi = nbi) {
2122 fprintf(stderr, "%s: FAILED %s\n", get_pname(), bi->errstr);
2123 log_add(L_FAIL, "%s", bi->errstr);
2127 fprintf(stderr, " delay: %s now at level %d\n",
2128 bi->errstr, est(dp)->dump_level);
2129 log_add(L_INFO, "%s", bi->errstr);
2132 /* Clean up - dont be too fancy! */
2137 fprintf(stderr, " delay: Total size now %ld.\n", total_size);
2144 * Remove a dump or modify it from full to incremental.
2145 * Keep track of it on the bi q in case we can add it back later.
/*
 * Take one dump out of tonight's schedule, or demote it to its degraded
 * level, and remember how to undo that.
 *
 * dp     - the disk being delayed
 * delete - stored in the before-image as bi->deleted; callers pass 1 with
 *          "skipping"/"cannot incremental" messages and 0 with "full dump
 *          delayed", so it presumably selects removal versus demotion (the
 *          selecting test is on a line not shown in this listing)
 * ...    - char * message fragments appended to bi->errstr; the loop stops
 *          at the first NULL, so the list must end with NULL
 *
 * Side effects: subtracts the dump from total_size (and from total_lev0 for
 * a level 0), appends a new bi_t holding the old level and size to the tail
 * of biq, then either removes dp from schedq or switches it to
 * degr_level/degr_size and adds the new size back to total_size.
 */
2147 arglist_function1(static void delay_one_dump,
2153 char level_str[NUM_STR_SIZE];
2157 arglist_start(argp, delete);
2159 total_size -= tt_blocksize_kb + est(dp)->dump_size + tape_mark; /* same sum analyze_estimate() added for this dump */
2160 if(est(dp)->dump_level == 0) {
2161 total_lev0 -= (double) est(dp)->dump_size;
2164 bi = alloc(sizeof(bi_t));
/* link the new before-image at the tail of biq */
2166 bi->prev = biq.tail;
2167 if(biq.tail == NULL)
2170 biq.tail->next = bi;
2173 bi->deleted = delete;
2175 bi->level = est(dp)->dump_level; /* saved so delay_dumps() can reinstate the dump later */
2176 bi->size = est(dp)->dump_size;
2178 ap_snprintf(level_str, sizeof(level_str), "%d", est(dp)->dump_level);
2179 bi->errstr = vstralloc(dp->host->hostname,
2181 " ", datestamp ? datestamp : "?",
2185 while ((next = arglist_val(argp, char *)) != NULL) {
2186 bi->errstr = newvstralloc(bi->errstr, bi->errstr, sep, next, NULL);
2189 strappend(bi->errstr, "]");
2193 remove_disk(&schedq, dp);
2195 est(dp)->dump_level = est(dp)->degr_level;
2196 est(dp)->dump_size = est(dp)->degr_size;
2197 total_size += tt_blocksize_kb + est(dp)->dump_size + tape_mark;
/*
 * Promote one of tonight's incrementals to a full dump, choosing the disk
 * with the highest "promote" score.
 *
 * Takes no arguments; reads schedq, total_size, total_lev0, balanced_size,
 * balance_threshold, tape_length and conf_dumpcycle.
 *
 * Returns 1 when a disk was promoted (per the comment in the body); the
 * other return value is on a line not shown in this listing.
 *
 * Scoring, per disk in schedq:
 *   - the score starts at -1000; disks with no positive level 0 estimate,
 *     already due (next_level0 <= 0) or beyond maxpromoteday are tested
 *     first (the action taken is on hidden lines -- presumably skipped);
 *   - disks are counted that are level 0 today or share this disk's
 *     next_level0, overall and for the same host (the increments are on
 *     hidden lines);
 *   - a disk is skipped when promotion would overflow the tape, would
 *     exceed balanced_size + balance_threshold while the host already has
 *     a level 0 today, or is the host's only disk due that day with
 *     nothing today;
 *   - otherwise the score combines the day counts with
 *     conf_dumpcycle - next_level0.
 * On promotion the old level/size become the degraded fallback, the disk is
 * set to level 0 with next_level0 = 0, and total_size/total_lev0 are
 * updated.
 */
2204 static int promote_highest_priority_incremental P((void))
2206 disk_t *dp, *dp1, *dp_promote;
2207 long new_size, new_total, new_lev0;
2209 int nb_today, nb_same_day, nb_today2;
2210 int nb_disk_today, nb_disk_same_day;
2213 * return 1 if did so; must update total_size correctly; must not
2214 * cause total_size to exceed tape_length
2218 for(dp = schedq.head; dp != NULL; dp = dp->next) {
2220 est(dp)->promote = -1000; /* starting score, set before any of the tests below */
2222 if(est_size(dp,0) <= 0)
2225 if(est(dp)->next_level0 <= 0)
2228 if(est(dp)->next_level0 > dp->maxpromoteday)
2231 new_size = est_tape_size(dp, 0);
2232 new_total = total_size - est(dp)->dump_size + new_size;
2233 new_lev0 = total_lev0 + new_size;
2238 nb_disk_same_day = 0;
2239 for(dp1 = schedq.head; dp1 != NULL; dp1 = dp1->next) {
2240 if(est(dp1)->dump_level == 0)
2242 else if(est(dp1)->next_level0 == est(dp)->next_level0)
2244 if(strcmp(dp->host->hostname, dp1->host->hostname) == 0) { /* same tests again, restricted to dp's own host */
2245 if(est(dp1)->dump_level == 0)
2247 else if(est(dp1)->next_level0 == est(dp)->next_level0)
2252 /* do not promote if overflow tape */
2253 if(new_total > tape_length) continue;
2255 /* do not promote if overflow balanced size and something today */
2256 /* promote if nothing today */
2257 if(new_lev0 > balanced_size+balance_threshold && nb_disk_today > 0)
2260 /* do not promote if only one disk due that day and nothing today */
2261 if(nb_disk_same_day == 1 && nb_disk_today == 0) continue;
2263 nb_today2 = nb_today*nb_today;
2264 if(nb_today == 0 && nb_same_day > 1) nb_same_day++;
2266 if(nb_same_day >= nb_today2) {
2267 est(dp)->promote = ((nb_same_day - nb_today2)*(nb_same_day - nb_today2)) +
2268 conf_dumpcycle - est(dp)->next_level0;
2271 est(dp)->promote = -nb_today2 +
2272 conf_dumpcycle - est(dp)->next_level0;
2275 if(!dp_promote || est(dp_promote)->promote < est(dp)->promote) { /* strict "<": on a tie the earlier disk in schedq is kept */
2277 fprintf(stderr," try %s:%s %d %d %d = %d\n",
2278 dp->host->hostname, dp->name, nb_same_day, nb_today, est(dp)->next_level0, est(dp)->promote);
2281 fprintf(stderr,"no try %s:%s %d %d %d = %d\n",
2282 dp->host->hostname, dp->name, nb_same_day, nb_today, est(dp)->next_level0, est(dp)->promote);
2289 new_size = est_tape_size(dp, 0);
2290 new_total = total_size - est(dp)->dump_size + new_size;
2291 new_lev0 = total_lev0 + new_size;
2293 total_size = new_total;
2294 total_lev0 = new_lev0;
2295 check_days = est(dp)->next_level0; /* saved for the log message before next_level0 is zeroed */
2296 est(dp)->degr_level = est(dp)->dump_level; /* the old incremental becomes the degraded-mode fallback */
2297 est(dp)->degr_size = est(dp)->dump_size;
2298 est(dp)->dump_level = 0;
2299 est(dp)->dump_size = new_size;
2300 est(dp)->next_level0 = 0;
2303 " promote: moving %s:%s up, total_lev0 %1.0f, total_size %ld\n",
2304 dp->host->hostname, dp->name,
2305 total_lev0, total_size);
2308 "Full dump of %s:%s promoted from %d day%s ahead.",
2309 dp->host->hostname, dp->name,
2310 check_days, (check_days == 1) ? "" : "s");
/*
 * Flatten the biggest upcoming "hill" of full dumps by promoting one of its
 * disks to a level 0 tonight.
 *
 * Takes no arguments; reads schedq, conf_tapecycle, total_size, total_lev0
 * and tape_length.
 *
 * Builds a per-day table sp[0..tapecycle-1] from each schedulable disk's
 * next_level0 and last_lev0size, then repeatedly picks the day with the
 * largest total size among days holding more than one disk.  The first disk
 * of that day that may be promoted and still fits on tape is switched to
 * level 0; its previous level and size become the degraded fallback.  When
 * no disk of the hill qualifies, the hill's disk count is zeroed so the
 * search moves to the next one.
 *
 * The return values and the release of sp are on lines not shown in this
 * listing.
 */
2317 static int promote_hills P((void))
2320 struct balance_stats {
2331 /* If we are already doing a level 0 don't bother */
2335 /* Do the guts of an "amadmin balance" */
2336 tapecycle = conf_tapecycle;
2338 sp = (struct balance_stats *)
2339 alloc(sizeof(struct balance_stats) * tapecycle);
2341 for(days = 0; days < tapecycle; days++)
2342 sp[days].disks = sp[days].size = 0;
2344 for(dp = schedq.head; dp != NULL; dp = dp->next) {
2345 days = est(dp)->next_level0; /* This is > 0 by definition */
2346 if(days<tapecycle && !dp->skip_full && dp->strategy != DS_NOFULL &&
2347 dp->strategy != DS_INCRONLY) {
2349 sp[days].size += est(dp)->last_lev0size; /* hills are sized from the last recorded level 0 size */
2353 /* Search for a suitable big hill and cut it down */
2355 /* Find the tallest hill */
2357 for(days = 0; days < tapecycle; days++) {
2358 if(sp[days].disks > 1 && sp[days].size > hill_size) { /* a day with a single disk is never a hill */
2359 hill_size = sp[days].size;
2364 if(hill_size <= 0) break; /* no suitable hills */
2366 /* Find all the dumps in that hill and try and remove one */
2367 for(dp = schedq.head; dp != NULL; dp = dp->next) {
2368 if(est(dp)->next_level0 != hill_days ||
2369 est(dp)->next_level0 > dp->maxpromoteday ||
2371 dp->strategy == DS_NOFULL ||
2372 dp->strategy == DS_INCRONLY)
2374 new_size = est_tape_size(dp, 0);
2375 new_total = total_size - est(dp)->dump_size + new_size;
2376 if(new_total > tape_length)
2378 /* We found a disk we can promote */
2379 total_size = new_total;
2380 total_lev0 += new_size;
2381 est(dp)->degr_level = est(dp)->dump_level; /* the old incremental becomes the degraded-mode fallback */
2382 est(dp)->degr_size = est(dp)->dump_size;
2383 est(dp)->dump_level = 0;
2384 est(dp)->next_level0 = 0;
2385 est(dp)->dump_size = new_size;
2388 " promote: moving %s:%s up, total_lev0 %1.0f, total_size %ld\n",
2389 dp->host->hostname, dp->name,
2390 total_lev0, total_size);
2393 "Full dump of %s:%s specially promoted from %d day%s ahead.",
2394 dp->host->hostname, dp->name,
2395 hill_days, (hill_days == 1) ? "" : "s");
2400 /* All the disks in that hill were unsuitable. */
2401 sp[hill_days].disks = 0; /* Don't get tricked again */
2409 * ========================================================================
2412 * XXX - memory leak - we shouldn't just throw away *dp
2414 static void output_scheduleline(dp)
2418 long dump_time = 0, degr_time = 0;
2419 char *schedline = NULL, *degr_str = NULL;
2420 char dump_priority_str[NUM_STR_SIZE];
2421 char dump_level_str[NUM_STR_SIZE];
2422 char dump_size_str[NUM_STR_SIZE];
2423 char dump_time_str[NUM_STR_SIZE];
2424 char degr_level_str[NUM_STR_SIZE];
2425 char degr_size_str[NUM_STR_SIZE];
2426 char degr_time_str[NUM_STR_SIZE];
2427 char *dump_date, *degr_date;
2433 if(ep->dump_size == -1) {
2434 /* no estimate, fail the disk */
2436 "%s: FAILED %s %s %s %d [no estimate]\n",
2438 dp->host->hostname, dp->name, datestamp, ep->dump_level);
2439 log_add(L_FAIL, "%s %s %s %d [no estimate]",
2440 dp->host->hostname, dp->name, datestamp, ep->dump_level);
2444 dump_date = degr_date = (char *)0;
2445 for(i = 0; i < MAX_LEVELS; i++) {
2446 if(ep->dump_level == ep->level[i])
2447 dump_date = ep->dumpdate[i];
2448 if(ep->degr_level == ep->level[i])
2449 degr_date = ep->dumpdate[i];
2452 #define fix_rate(rate) (rate < 1.0 ? DEFAULT_DUMPRATE : rate)
2454 if(ep->dump_level == 0) {
2455 dump_time = ep->dump_size / fix_rate(ep->fullrate);
2457 if(ep->degr_size != -1) {
2458 degr_time = ep->degr_size / fix_rate(ep->incrrate);
2462 dump_time = ep->dump_size / fix_rate(ep->incrrate);
2465 if(ep->dump_level == 0 && ep->degr_size != -1) {
2466 ap_snprintf(degr_level_str, sizeof(degr_level_str),
2467 "%d", ep->degr_level);
2468 ap_snprintf(degr_size_str, sizeof(degr_size_str),
2469 "%ld", ep->degr_size);
2470 ap_snprintf(degr_time_str, sizeof(degr_time_str),
2472 degr_str = vstralloc(" ", degr_level_str,
2478 ap_snprintf(dump_priority_str, sizeof(dump_priority_str),
2479 "%d", ep->dump_priority);
2480 ap_snprintf(dump_level_str, sizeof(dump_level_str),
2481 "%d", ep->dump_level);
2482 ap_snprintf(dump_size_str, sizeof(dump_size_str),
2483 "%ld", ep->dump_size);
2484 ap_snprintf(dump_time_str, sizeof(dump_time_str),
2486 features = am_feature_to_string(dp->host->features);
2487 schedline = vstralloc("DUMP ",dp->host->hostname,
2491 " ", dump_priority_str,
2492 " ", dump_level_str,
2496 degr_str ? degr_str : "",
2499 fputs(schedline, stdout);
2500 fputs(schedline, stderr);