2 * Amanda, The Advanced Maryland Automatic Network Disk Archiver
3 * Copyright (c) 1991-1998 University of Maryland at College Park
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of U.M. not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission. U.M. makes no representations about the
13 * suitability of this software for any purpose. It is provided "as is"
14 * without express or implied warranty.
16 * U.M. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL U.M.
18 * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
20 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
21 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
23 * Authors: the Amanda Development Team. Its members are listed in a
24 * file named AUTHORS, in the root directory of this distribution.
27 * $Id: driver.c,v 1.164 2006/03/22 15:07:08 martinea Exp $
29 * controlling process for the Amanda backup system
33 * XXX possibly modify tape queue to be cognizant of how much room is left on
34 * tape. Probably not effective though, should do this in planner.
37 /*#define HOLD_DEBUG*/
50 #include "server_util.h"
/*
 * File-scope driver state.  NOTE(review): the roles described below are read
 * off the uses visible in this file; confirm against the complete source.
 */
/* Disk queues: start_some_dumps() is run on runq, dumps for the taper are
 * taken from tapeq, and roomq holds dumps waiting for more holding-disk
 * space. */
52 static disklist_t waitq, runq, tapeq, roomq;
/* Checked by continue_port_dumps() and asserted non-zero when a dumper
 * reports ABORT-FINISHED. */
53 static int pending_aborts;
54 static disk_t *taper_disk;
/* Presumably non-zero once the driver dumps to holding disk only. */
55 static int degraded_mode;
/* The reserve percentage of total_disksize, computed in main(). */
56 static unsigned long reserved_space;
/* Sum of the sizes of the holding disks added in main(). */
57 static unsigned long total_disksize;
/* Paths of the helper executables, built under libexecdir in main(). */
58 static char *dumper_program;
59 static char *chunker_program;
/* Number of dumpers: CNF_INPARALLEL, capped at MAX_DUMPERS. */
60 static int inparallel;
/* When non-zero, main() does not start the dumper processes. */
61 static int nodump = 0;
/* tape_length comes from the configured tapetype; tape_left is reset to
 * tape_length and reduced as dumps are handed to the taper. */
62 static unsigned long tape_length, tape_left = 0;
63 static int current_tape = 1;
64 static int conf_taperalgo;
65 static int conf_runtapes;
/* Earliest start time among delayed disks; used for the EV_TIME event. */
66 static time_t sleep_time;
/* IDLE_* code computed by start_some_dumps(). */
67 static int idle_reason;
/* Stamps for this run, from construct_datestamp() and construct_timestamp(). */
68 static char *datestamp;
69 static char *timestamp;
70 static am_host_t *flushhost = NULL;
71 static int need_degraded=0;
/* Timer event that re-runs start_some_dumps() via handle_dumpers_time(). */
73 static event_handle_t *dumpers_ev_time = NULL;
/* Read event registered in main() for read_schedule(). */
74 static event_handle_t *schedule_ev_read = NULL;
/* Forward declarations.  P(()) wraps each parameter list; the macro itself
 * is not defined in the part of the file visible here. */
76 static void allocate_bandwidth P((interface_t *ip, int kps));
77 static int assign_holdingdisk P((assignedhd_t **holdp, disk_t *diskp));
78 static void adjust_diskspace P((disk_t *diskp, cmd_t cmd));
79 static void delete_diskspace P((disk_t *diskp));
80 static assignedhd_t **build_diskspace P((char *destname));
81 static int client_constrained P((disk_t *dp));
82 static void deallocate_bandwidth P((interface_t *ip, int kps));
83 static void dump_schedule P((disklist_t *qp, char *str));
84 static int dump_to_tape P((disk_t *dp));
85 static assignedhd_t **find_diskspace P((unsigned long size, int *cur_idle,
86 assignedhd_t *preferred));
87 static int free_kps P((interface_t *ip));
88 static unsigned long free_space P((void));
89 static void dumper_result P((disk_t *dp));
90 static void handle_dumper_result P((void *));
91 static void handle_chunker_result P((void *));
92 static void handle_dumpers_time P((void *));
93 static void handle_taper_result P((void *));
94 static void holdingdisk_state P((char *time_str));
95 static dumper_t *idle_dumper P((void));
96 static void interface_state P((char *time_str));
97 static int num_busy_dumpers P((void));
98 static int queue_length P((disklist_t q));
99 static disklist_t read_flush P((void));
100 static void read_schedule P((void *cookie));
101 static void short_dump_state P((void));
102 static void startaflush P((void));
103 static void start_degraded_mode P((disklist_t *queuep));
104 static void start_some_dumps P((disklist_t *rq));
105 static void continue_port_dumps();
106 static void update_failed_dump_to_tape P((disk_t *));
108 static void dump_state P((const char *str));
110 int main P((int main_argc, char **main_argv));
/*
 * Idle-reason codes, declared in between the entries of the table of their
 * display strings.  start_some_dumps() combines codes with max(), so a
 * larger code overrides a smaller one.
 * NOTE(review): each IDLE_* value presumably equals the index of its string
 * in idle_strings -- confirm against the complete table.
 */
112 static const char *idle_strings[] = {
115 #define IDLE_NO_DUMPERS 1
117 #define IDLE_START_WAIT 2
119 #define IDLE_NO_HOLD 3
121 #define IDLE_CLIENT_CONSTRAINED 4
122 "client-constrained",
123 #define IDLE_NO_DISKSPACE 5
125 #define IDLE_TOO_LARGE 6
127 #define IDLE_NO_BANDWIDTH 7
129 #define IDLE_TAPER_WAIT 8
/*
 * Driver entry point.  Steps visible here, in order: make stdout and stderr
 * line buffered and ignore SIGPIPE, load the configuration file and the
 * disklist, start the taper when runtapes > 0, size the holding disks and
 * compute the degraded-mode reserve, start the dumpers, read the flush list
 * and register read_schedule(), dump leftover runq entries straight to tape,
 * then tell the children to quit, reap them and release resources.
 */
134 main(main_argc, main_argv)
143 generic_fs_stats_t fs;
145 unsigned long malloc_hist_1, malloc_size_1;
146 unsigned long malloc_hist_2, malloc_size_2;
147 unsigned long reserve = 100;
152 char *result_argv[MAX_ARGS+1];
/* Line-buffer both standard streams. */
160 setvbuf(stdout, (char *)NULL, _IOLBF, 0);
161 setvbuf(stderr, (char *)NULL, _IOLBF, 0);
165 /* Don't die when child closes pipe */
166 signal(SIGPIPE, SIG_IGN);
168 malloc_size_1 = malloc_inuse(&malloc_hist_1);
170 erroutput_type = (ERR_AMANDALOG|ERR_INTERACTIVE);
171 set_logerror(logerror);
175 printf("%s: pid %ld executable %s version %s\n",
176 get_pname(), (long) getpid(), main_argv[0], version());
179 config_name = stralloc(main_argv[1]);
180 config_dir = vstralloc(CONFIG_DIR, "/", config_name, "/", NULL);
182 if(strncmp(main_argv[2], "nodump", 6) == 0) {
/* Alternative path (its condition is not visible here): derive config_dir
 * and config_name from the current working directory. */
188 char my_cwd[STR_SIZE];
190 if (getcwd(my_cwd, sizeof(my_cwd)) == NULL) {
191 error("cannot determine current working directory");
193 config_dir = stralloc2(my_cwd, "/");
194 if ((config_name = strrchr(my_cwd, '/')) != NULL) {
195 config_name = stralloc(config_name + 1);
201 conffile = stralloc2(config_dir, CONFFILE_NAME);
202 if(read_conffile(conffile)) {
203 error("errors processing config file \"%s\"", conffile);
208 datestamp = construct_datestamp(NULL);
209 timestamp = construct_timestamp(NULL);
210 log_add(L_START,"date %s", datestamp);
212 taper_program = vstralloc(libexecdir, "/", "taper", versionsuffix(), NULL);
213 dumper_program = vstralloc(libexecdir, "/", "dumper", versionsuffix(),
215 chunker_program = vstralloc(libexecdir, "/", "chunker", versionsuffix(),
218 conf_taperalgo = getconf_int(CNF_TAPERALGO);
219 conf_tapetype = getconf_str(CNF_TAPETYPE);
220 conf_runtapes = getconf_int(CNF_RUNTAPES);
221 tape = lookup_tapetype(conf_tapetype);
222 tape_length = tape->length;
/* NOTE(review): tape_length is unsigned long but is printed with %ld. */
223 printf("driver: tape size %ld\n", tape_length);
225 /* taper takes a while to get going, so start it up right away */
228 if(conf_runtapes > 0) {
229 startup_tape_process(taper_program);
230 taper_cmd(START_TAPER, datestamp, NULL, 0, NULL);
233 /* start initializing: read in databases */
235 conf_diskfile = getconf_str(CNF_DISKFILE);
/* An absolute disklist path is used as is; otherwise it is taken relative
 * to config_dir. */
236 if (*conf_diskfile == '/') {
237 conf_diskfile = stralloc(conf_diskfile);
239 conf_diskfile = stralloc2(config_dir, conf_diskfile);
241 if (read_diskfile(conf_diskfile, &origq) < 0)
242 error("could not load disklist \"%s\"", conf_diskfile);
243 amfree(conf_diskfile);
245 /* set up any configuration-dependent variables */
247 inparallel = getconf_int(CNF_INPARALLEL);
249 reserve = getconf_int(CNF_RESERVE);
/* Give every holding disk its allocation record and settle its usable size. */
252 for(hdp = getconf_holdingdisks(), dsk = 0; hdp != NULL; hdp = hdp->next, dsk++) {
253 hdp->up = (void *)alloc(sizeof(holdalloc_t));
254 holdalloc(hdp)->allocated_dumpers = 0;
255 holdalloc(hdp)->allocated_space = 0L;
/* A holding disk that cannot be examined or written to is ignored with a
 * warning. */
257 if(get_fs_stats(hdp->diskdir, &fs) == -1
258 || access(hdp->diskdir, W_OK) == -1) {
259 log_add(L_WARNING, "WARNING: ignoring holding disk %s: %s\n",
260 hdp->diskdir, strerror(errno));
/* A positive disksize is an upper bound, clipped to the space available. */
266 if(hdp->disksize > 0) {
267 if(hdp->disksize > fs.avail) {
269 "WARNING: %s: %ld KB requested, but only %ld KB available.",
270 hdp->diskdir, hdp->disksize, fs.avail);
271 hdp->disksize = fs.avail;
274 else if(fs.avail + hdp->disksize < 0) {
276 "WARNING: %s: not %ld KB free.",
277 hdp->diskdir, -hdp->disksize);
/* Remaining case: disksize is added to the available space (presumably
 * disksize is zero or negative here -- confirm). */
282 hdp->disksize += fs.avail;
285 printf("driver: adding holding disk %d dir %s size %ld chunksize %ld\n",
286 dsk, hdp->diskdir, hdp->disksize, hdp->chunksize);
288 newdir = newvstralloc(newdir,
289 hdp->diskdir, "/", timestamp,
291 if(!mkholdingdir(newdir)) {
294 total_disksize += hdp->disksize;
/* reserve is a percentage of the total holding space. */
297 reserved_space = total_disksize * (reserve / 100.0);
299 printf("reserving %ld out of %ld for degraded-mode dumps\n",
300 reserved_space, free_space());
304 if(inparallel > MAX_DUMPERS) inparallel = MAX_DUMPERS;
306 /* fire up the dumpers now while we are waiting */
307 if(!nodump) startup_dump_processes(dumper_program, inparallel);
310 * Read schedule from stdin. Usually, this is a pipe from planner,
311 * so the effect is that we wait here for the planner to
312 * finish, but meanwhile the taper is rewinding the tape, reading
313 * the label, checking it, writing a new label and all that jazz
314 * in parallel with the planner.
320 tapeq = read_flush();
322 roomq.head = roomq.tail = NULL;
324 log_add(L_STATS, "startup time %s", walltime_str(curclock()));
326 printf("driver: start time %s inparallel %d bandwidth %d diskspace %lu",
327 walltime_str(curclock()), inparallel, free_kps((interface_t *)0),
329 printf(" dir %s datestamp %s driver: drain-ends tapeq %s big-dumpers %s\n",
330 "OBSOLETE", datestamp, taperalgo2str(conf_taperalgo),
331 getconf_str(CNF_DUMPORDER));
334 /* ok, planner is done, now lets see if the tape is ready */
336 if(conf_runtapes > 0) {
/* Pick up the taper reply to the START_TAPER command sent earlier. */
337 cmd = getresult(taper, 1, &result_argc, result_argv, MAX_ARGS+1);
339 if(cmd != TAPER_OK) {
340 /* no tape, go into degraded mode: dump to holding disk */
348 tape_left = tape_length;
351 taper_ev_read = NULL;
352 if(!need_degraded) startaflush();
355 schedule_ev_read = event_register(0, EV_READFD, read_schedule, NULL);
360 /* handle any remaining dumps by dumping directly to tape, if possible */
362 while(!empty(runq) && taper > 0) {
363 diskp = dequeue_disk(&runq);
365 int rc = dump_to_tape(diskp);
368 "%s %s %d [dump to tape failed, will try again]",
369 diskp->host->hostname,
371 sched(diskp)->level);
373 log_add(L_FAIL, "%s %s %s %d [dump to tape failed]",
374 diskp->host->hostname,
376 sched(diskp)->datestamp,
377 sched(diskp)->level);
380 log_add(L_FAIL, "%s %s %s %d [%s]",
381 diskp->host->hostname, diskp->name, sched(diskp)->datestamp,
384 "can't dump no-hold disk in degraded mode" :
385 "no more holding disk space");
388 short_dump_state(); /* for amstatus */
390 printf("driver: QUITTING time %s telling children to quit\n",
391 walltime_str(curclock()));
/* Ask every dumper, then the taper, to quit. */
395 for(dumper = dmptable; dumper < dmptable + inparallel; dumper++) {
396 dumper_cmd(dumper, QUIT, NULL);
401 taper_cmd(QUIT, NULL, NULL, 0, NULL);
404 /* wait for all to die */
407 char number[NUM_STR_SIZE];
/* Reap one child per iteration; an interrupted wait is simply retried. */
413 if((pid = wait(&retstat)) == -1) {
414 if(errno == EINTR) continue;
/* Record the terminating signal or the non-zero exit status. */
418 if(! WIFEXITED(retstat)) {
420 code = WTERMSIG(retstat);
421 } else if(WEXITSTATUS(retstat) != 0) {
423 code = WEXITSTATUS(retstat);
/* Name the child by matching its pid against the dumpers and the taper. */
426 for(dumper = dmptable; dumper < dmptable + inparallel; dumper++) {
427 if(pid == dumper->pid) {
428 who = stralloc(dumper->name);
432 if(who == NULL && pid == taper_pid) {
433 who = stralloc("taper");
435 if(what != NULL && who == NULL) {
436 snprintf(number, sizeof(number), "%ld", (long)pid);
437 who = stralloc2("unknown pid ", number);
440 log_add(L_WARNING, "%s exited with %s %d\n", who, what, code);
441 printf("driver: %s exited with %s %d\n", who, what, code);
446 for(dumper = dmptable; dumper < dmptable + inparallel; dumper++) {
447 amfree(dumper->name);
450 for(hdp = getconf_holdingdisks(); hdp != NULL; hdp = hdp->next) {
451 cleanup_holdingdisk(hdp->diskdir, 0);
456 check_unfree_serial();
457 printf("driver: FINISHED time %s\n", walltime_str(curclock()));
459 log_add(L_FINISH,"date %s time %s", datestamp, walltime_str(curclock()));
463 amfree(dumper_program);
464 amfree(taper_program);
/* Compare heap usage with the figure taken at startup and list the
 * difference on stderr when they disagree. */
468 malloc_size_2 = malloc_inuse(&malloc_hist_2);
470 if(malloc_size_1 != malloc_size_2) {
471 malloc_list(fileno(stderr), malloc_hist_1, malloc_hist_2);
/*
 * Body of startaflush() (its signature lines are not visible here).
 * When the driver is not in degraded mode, the taper is idle and tapeq is
 * not empty, one disk is removed from tapeq according to conf_taperalgo and
 * handed to the taper with FILE_WRITE.  Only entries whose datestamp does
 * not sort after that of the queue head are eligible.  When the taper is
 * idle and nothing is started, a registered taper read event is released.
 */
483 unsigned int extra_tapes = 0;
484 if(!degraded_mode && !taper_busy && !empty(tapeq)) {
486 datestamp = sched(tapeq.head)->datestamp;
487 switch(conf_taperalgo) {
489 dp = dequeue_disk(&tapeq);
493 while (fit != NULL) {
/* A split dump may also use the tapes still unused in this run. */
494 extra_tapes = (fit->tape_splitsize > 0) ?
495 conf_runtapes - current_tape : 0;
496 if(sched(fit)->act_size <= (tape_left + tape_length*extra_tapes) &&
497 strcmp(sched(fit)->datestamp, datestamp) <= 0) {
505 if(dp) remove_disk(&tapeq, dp);
508 fit = dp = tapeq.head;
509 while (fit != NULL) {
510 if(sched(fit)->act_size > sched(dp)->act_size &&
511 strcmp(sched(fit)->datestamp, datestamp) <= 0) {
516 if(dp) remove_disk(&tapeq, dp);
518 case ALGO_LARGESTFIT:
520 while (fit != NULL) {
521 extra_tapes = (fit->tape_splitsize > 0) ?
522 conf_runtapes - current_tape : 0;
523 if(sched(fit)->act_size <= (tape_left + tape_length*extra_tapes) &&
524 (!dp || sched(fit)->act_size > sched(dp)->act_size) &&
525 strcmp(sched(fit)->datestamp, datestamp) <= 0) {
530 if(dp) remove_disk(&tapeq, dp);
536 remove_disk(&tapeq, dp);
539 if(!dp) { /* ALGO_SMALLEST, or default if nothing fit. */
540 if(conf_taperalgo != ALGO_SMALLEST) {
542 "driver: startaflush: Using SMALLEST because nothing fit\n");
544 fit = dp = tapeq.head;
545 while (fit != NULL) {
546 if(sched(fit)->act_size < sched(dp)->act_size &&
547 strcmp(sched(fit)->datestamp, datestamp) <= 0) {
552 if(dp) remove_disk(&tapeq, dp);
/* Make sure taper replies are being listened for before sending work. */
554 if(taper_ev_read == NULL) {
555 taper_ev_read = event_register(taper, EV_READFD,
556 handle_taper_result, NULL);
561 taper_cmd(FILE_WRITE, dp, sched(dp)->destname, sched(dp)->level,
562 sched(dp)->datestamp);
563 fprintf(stderr,"driver: startaflush: %s %s %s %ld %ld\n",
564 taperalgo2str(conf_taperalgo), dp->host->hostname,
565 dp->name, sched(taper_disk)->act_size, tape_left);
/* Charge the dump against the current tape when it fits. */
566 if(sched(dp)->act_size <= tape_left)
567 tape_left -= sched(dp)->act_size;
571 error("FATAL: Taper marked busy and no work found.");
574 } else if(!taper_busy && taper_ev_read != NULL) {
575 event_release(taper_ev_read);
576 taper_ev_read = NULL;
/*
 * Checks whether disk dp is held back by its client: either the host is
 * already running maxdumps dumps, or another disk of the same host with the
 * same spindle number is in progress.  Spindle -1 is exempt from the
 * spindle test.  The return statements are not visible in this extract;
 * callers treat a non-zero result as constrained.
 */
582 client_constrained(dp)
587 /* first, check if host is too busy */
589 if(dp->host->inprogress >= dp->host->maxdumps) {
593 /* next, check conflict with other dumps on same spindle */
595 if(dp->spindle == -1) { /* but spindle -1 never conflicts by def. */
599 for(dp2 = dp->host->disks; dp2 != NULL; dp2 = dp2->hostnext)
600 if(dp2->inprogress && dp2->spindle == dp->spindle) {
/*
 * Body of start_some_dumps() (its signature lines are not visible here).
 * For each dumper in dmptable the run queue rq is scanned for a disk that
 * may start now.  A disk is passed over, and an IDLE_* reason recorded via
 * max(), when its host or the disk itself has a start time in the future,
 * when network bandwidth or holding-disk space is lacking, when the disk is
 * marked no_hold, or when client_constrained() reports a conflict.  Among
 * the remaining candidates the per-dumper character taken from the
 * CNF_DUMPORDER string selects by size, time or bandwidth, and in degraded
 * mode priority is honoured as well.  The selected disk gets bandwidth and
 * holding space, leaves rq, and a chunker (PORT_WRITE) and then the dumper
 * (PORT_DUMP) are started for it.  When nothing can start but some disk is
 * merely delayed, an EV_TIME event is registered for sleep_time.
 */
612 disk_t *diskp, *delayed_diskp, *diskp_accept;
613 assignedhd_t **holdp=NULL, **holdp_accept;
614 const time_t now = time(NULL);
617 char *result_argv[MAX_ARGS+1];
623 idle_reason = IDLE_NO_DUMPERS;
/* Cancel a pending wake-up timer; a fresh one is registered further down
 * when it is still needed. */
626 if(dumpers_ev_time != NULL) {
627 event_release(dumpers_ev_time);
628 dumpers_ev_time = NULL;
631 for (dumper = dmptable; dumper < dmptable+inparallel; dumper++) {
/* Release any read event still registered for this dumper. */
637 if (dumper->ev_read != NULL) {
638 /* assert(dumper->ev_read == NULL);*/
639 event_release(dumper->ev_read);
640 dumper->ev_read = NULL;
644 * A potential problem with starting from the bottom of the dump time
645 * distribution is that a slave host will have both one of the shortest
646 * and one of the longest disks, so starting its shortest disk first will
647 * tie up the host and eliminate its longest disk from consideration the
648 * first pass through. This could cause a big delay in starting that long
649 * disk, which could drag out the whole night's dumps.
651 * While starting from the top of the dump time distribution solves the
652 * above problem, this turns out to be a bad idea, because the big dumps
653 * will almost certainly pack the holding disk completely, leaving no
654 * room for even one small dump to start. This ends up shutting out the
655 * small-end dumpers completely (they stay idle).
657 * The introduction of multiple simultaneous dumps to one host alleviates
658 * the biggest&smallest dumps problem: both can be started at the
664 delayed_diskp = NULL;
668 dumporder = getconf_str(CNF_DUMPORDER);
669 if(strlen(dumporder) > (dumper-dmptable)) {
670 dumptype = dumporder[dumper-dmptable];
673 if(dumper-dmptable < 3)
679 for(diskp = rq->head; diskp != NULL; diskp = diskp->next) {
680 assert(diskp->host != NULL && sched(diskp) != NULL);
682 /* round estimate to next multiple of DISK_BLOCK_KB */
683 sched(diskp)->est_size = am_round(sched(diskp)->est_size,
686 if (diskp->host->start_t > now) {
687 cur_idle = max(cur_idle, IDLE_START_WAIT);
688 if (delayed_diskp == NULL || sleep_time > diskp->host->start_t) {
689 delayed_diskp = diskp;
690 sleep_time = diskp->host->start_t;
692 } else if(diskp->start_t > now) {
693 cur_idle = max(cur_idle, IDLE_START_WAIT);
694 if (delayed_diskp == NULL || sleep_time > diskp->start_t) {
695 delayed_diskp = diskp;
696 sleep_time = diskp->start_t;
698 } else if (diskp->host->netif->curusage > 0 &&
699 sched(diskp)->est_kps > free_kps(diskp->host->netif)) {
700 cur_idle = max(cur_idle, IDLE_NO_BANDWIDTH);
701 } else if(sched(diskp)->no_space) {
702 cur_idle = max(cur_idle, IDLE_NO_DISKSPACE);
704 find_diskspace(sched(diskp)->est_size,&cur_idle,NULL)) == NULL) {
705 cur_idle = max(cur_idle, IDLE_NO_DISKSPACE);
706 } else if (diskp->no_hold) {
707 free_assignedhd(holdp);
708 cur_idle = max(cur_idle, IDLE_NO_HOLD);
709 } else if (client_constrained(diskp)) {
710 free_assignedhd(holdp);
711 cur_idle = max(cur_idle, IDLE_CLIENT_CONSTRAINED);
714 /* disk fits, dump it */
715 int accept = !diskp_accept;
718 case 's': accept = (sched(diskp)->est_size < sched(diskp_accept)->est_size);
720 case 'S': accept = (sched(diskp)->est_size > sched(diskp_accept)->est_size);
722 case 't': accept = (sched(diskp)->est_time < sched(diskp_accept)->est_time);
724 case 'T': accept = (sched(diskp)->est_time > sched(diskp_accept)->est_time);
726 case 'b': accept = (sched(diskp)->est_kps < sched(diskp_accept)->est_kps);
728 case 'B': accept = (sched(diskp)->est_kps > sched(diskp_accept)->est_kps);
730 default: log_add(L_WARNING, "Unknown dumporder character \'%c\', using 's'.\n",
732 accept = (sched(diskp)->est_size < sched(diskp_accept)->est_size);
737 if( !diskp_accept || !degraded_mode || diskp->priority >= diskp_accept->priority) {
738 if(holdp_accept) free_assignedhd(holdp_accept);
739 diskp_accept = diskp;
740 holdp_accept = holdp;
743 free_assignedhd(holdp);
747 free_assignedhd(holdp);
752 diskp = diskp_accept;
753 holdp = holdp_accept;
755 idle_reason = max(idle_reason, cur_idle);
758 * If we have no disk at this point, and there are disks that
759 * are delayed, then schedule a time event to call this dumper
760 * with the disk with the shortest delay.
762 if (diskp == NULL && delayed_diskp != NULL) {
763 assert(sleep_time > now);
765 dumpers_ev_time = event_register(sleep_time, EV_TIME,
766 handle_dumpers_time, &runq);
768 } else if (diskp != NULL) {
769 sched(diskp)->act_size = 0;
770 allocate_bandwidth(diskp->host->netif, sched(diskp)->est_kps);
771 sched(diskp)->activehd = assign_holdingdisk(holdp, diskp);
773 sched(diskp)->destname = newstralloc(sched(diskp)->destname,
774 sched(diskp)->holdp[0]->destname);
775 diskp->host->inprogress++; /* host is now busy */
776 diskp->inprogress = 1;
777 sched(diskp)->dumper = dumper;
778 sched(diskp)->timestamp = now;
780 dumper->ev_read = event_register(dumper->fd, EV_READFD,
781 handle_dumper_result, dumper);
782 dumper->busy = 1; /* dumper is now busy */
783 dumper->dp = diskp; /* link disk to dumper */
784 remove_disk(rq, diskp); /* take it off the run queue */
786 sched(diskp)->origsize = -1;
787 sched(diskp)->dumpsize = -1;
788 sched(diskp)->dumptime = -1;
789 sched(diskp)->tapetime = -1;
790 chunker = dumper->chunker;
791 chunker->result = LAST_TOK;
792 dumper->result = LAST_TOK;
793 startup_chunk_process(chunker,chunker_program);
794 chunker->dumper = dumper;
795 chunker_cmd(chunker, PORT_WRITE, diskp);
796 cmd = getresult(chunker->fd, 1, &result_argc, result_argv, MAX_ARGS+1);
798 printf("driver: did not get PORT from %s for %s:%s\n",
799 chunker->name, diskp->host->hostname, diskp->name);
801 return ; /* fatal problem */
803 chunker->ev_read = event_register(chunker->fd, EV_READFD,
804 handle_chunker_result, chunker);
805 dumper->output_port = atoi(result_argv[2]);
807 dumper_cmd(dumper, PORT_DUMP, diskp);
809 diskp->host->start_t = now + 15;
810 } else if (/* cur_idle != NOT_IDLE && */
811 (num_busy_dumpers() > 0 || taper_busy)) {
813 * We are constrained.
820 * This gets called when a dumper is delayed for some reason. It may
821 * be because a disk has a delayed start, or amanda is constrained
822 * by network or disk limits.
/*
 * Event callback: cookie is the run queue that was passed when the event
 * was registered.  Releases the dumpers_ev_time event, clears the handle
 * and runs start_some_dumps() on that queue.
 */
825 handle_dumpers_time(cookie)
828 disklist_t *runq = cookie;
829 event_release(dumpers_ev_time);
830 dumpers_ev_time = NULL;
831 start_some_dumps(runq);
/*
 * Prints the queue qp to stdout, one line per disk, between a heading that
 * carries the caller-supplied label str and a closing rule.  Each line
 * shows host name, disk name, dump level, estimated time, estimated size
 * and priority.
 */
835 dump_schedule(qp, str)
841 printf("dump of driver schedule %s:\n--------\n", str);
843 for(dp = qp->head; dp != NULL; dp = dp->next) {
844 printf(" %-20s %-25s lv %d t %5ld s %8lu p %d\n",
845 dp->host->hostname, dp->name, sched(dp)->level,
846 sched(dp)->est_time, sched(dp)->est_size, sched(dp)->priority);
848 printf("--------\n");
/*
 * Reworks the queue queuep for degraded mode.  The taper read event is
 * released, then every disk is dequeued and re-examined: dumps that are
 * not level 0 are kept unchanged; level 0 dumps are kept while a size test
 * involving reserved_space and the running total est_full_size passes
 * (the right-hand side of that test is not visible here); other level 0
 * dumps are switched to their degr_* level, dates and estimates when
 * degr_level is not -1, and are otherwise logged as failed.  The schedule
 * is printed before and after.
 */
852 start_degraded_mode(queuep)
857 unsigned long est_full_size;
859 if (taper_ev_read != NULL) {
860 event_release(taper_ev_read);
861 taper_ev_read = NULL;
864 newq.head = newq.tail = 0;
866 dump_schedule(queuep, "before start degraded mode");
869 while(!empty(*queuep)) {
870 dp = dequeue_disk(queuep);
872 if(sched(dp)->level != 0)
873 /* go ahead and do the disk as-is */
874 enqueue_disk(&newq, dp);
876 if (reserved_space + est_full_size + sched(dp)->est_size
878 enqueue_disk(&newq, dp);
879 est_full_size += sched(dp)->est_size;
/* Fall back to the degraded-mode level and its estimates. */
881 else if(sched(dp)->degr_level != -1) {
882 sched(dp)->level = sched(dp)->degr_level;
883 sched(dp)->dumpdate = sched(dp)->degr_dumpdate;
884 sched(dp)->est_size = sched(dp)->degr_size;
885 sched(dp)->est_time = sched(dp)->degr_time;
886 sched(dp)->est_kps = sched(dp)->degr_kps;
887 enqueue_disk(&newq, dp);
890 log_add(L_FAIL,"%s %s %s %d [can't switch to incremental dump]",
891 dp->host->hostname, dp->name, sched(dp)->datestamp,
900 dump_schedule(queuep, "after start degraded mode");
/*
 * Tries to get dumps that are waiting in roomq going again.  First, for
 * each waiting dump, more holding space is requested for the part of the
 * estimate not yet written, preferring the holding disk used last; on
 * success the chunker of the owning dumper is sent CONTINUE and the dump
 * leaves roomq.  Afterwards, when no dumper is making progress and the
 * taper cannot free space, one waiting dump is aborted (ABORT is sent to
 * its chunker and dumper) to break the deadlock.
 */
904 static void continue_port_dumps()
908 int active_dumpers=0, busy_dumpers=0, i;
911 /* First we try to grant diskspace to some dumps waiting for it. */
912 for( dp = roomq.head; dp; dp = ndp ) {
914 /* find last holdingdisk used by this dump */
915 for( i = 0, h = sched(dp)->holdp; h[i+1]; i++ );
916 /* find more space */
917 h = find_diskspace( sched(dp)->est_size - sched(dp)->act_size,
918 &active_dumpers, h[i] );
/* Locate the dumper that is working on dp. */
920 for(dumper = dmptable; dumper < dmptable + inparallel &&
921 dumper->dp != dp; dumper++);
922 assert( dumper < dmptable + inparallel );
923 sched(dp)->activehd = assign_holdingdisk( h, dp );
924 chunker_cmd( dumper->chunker, CONTINUE, dp );
926 remove_disk( &roomq, dp );
930 /* So for some disks there is less holding diskspace available than
931 * was asked for. Possible reasons are
932 * a) diskspace has been allocated for other dumps which are
933 * still running or already being written to tape
934 * b) all other dumps have been suspended due to lack of diskspace
935 * c) this dump doesn't fit on all the holding disks
936 * Case a) is not a problem. We just wait for the diskspace to
937 * be freed by moving the current disk to a queue.
938 * If case b) occurs, we have a deadlock situation. We select
939 * a dump from the queue to be aborted and abort it. It will
940 * be retried later dumping to disk.
941 * If case c) is detected, the dump is aborted. Next time
942 * it will be dumped directly to tape. Actually, case c is a special
943 * manifestation of case b) where only one dumper is busy.
945 for(dp=NULL, dumper = dmptable; dumper < (dmptable+inparallel); dumper++) {
948 if( !find_disk(&roomq, dumper->dp) ) {
951 sched(dp)->est_size > sched(dumper->dp)->est_size ) {
956 if((dp != NULL) && (active_dumpers == 0) && (busy_dumpers > 0) &&
957 ((!taper_busy && empty(tapeq)) || degraded_mode) &&
958 pending_aborts == 0 ) { /* not case a */
959 if( busy_dumpers == 1 ) { /* case c */
960 sched(dp)->no_space = 1;
963 /* At this time, dp points to the dump with the smallest est_size.
964 * We abort that dump, hopefully not wasting too much time retrying it.
966 remove_disk( &roomq, dp );
967 chunker_cmd( sched(dp)->dumper->chunker, ABORT, NULL );
968 dumper_cmd( sched(dp)->dumper, ABORT, NULL );
/*
 * Event callback for replies from the taper; cookie must be NULL.  Replies
 * are read with getresult() in a loop for as long as areads_dataready()
 * reports more data.  Visible cases: DONE (record the tape location, free
 * the holding space and the per-dump strings), TRY-AGAIN (requeue at the
 * head of tapeq unless the retry budget is used up), SPLIT-CONTINUE and
 * SPLIT-NEEDNEXT (tape accounting for split dumps), TAPE-ERROR and the
 * protocol-error path (switch to degraded mode and empty tapeq).  After
 * each reply start_some_dumps() is run on runq.
 */
975 handle_taper_result(void *cookie)
981 char *result_argv[MAX_ARGS+1];
984 assert(cookie == NULL);
990 cmd = getresult(taper, 1, &result_argc, result_argv, MAX_ARGS+1);
995 case DONE: /* DONE <handle> <label> <tape file> <err mess> */
996 if(result_argc != 5) {
/* NOTE(review): this message opens a bracket that is never closed, and its
 * prefix differs from the sibling messages in this function. */
997 error("error: [taper DONE result_argc != 5: %d", result_argc);
1000 dp = serial2disk(result_argv[2]);
1001 free_serial(result_argv[2]);
1003 filenum = atoi(result_argv[4]);
1005 update_info_taper(dp, result_argv[3], filenum,
1009 delete_diskspace(dp);
1011 printf("driver: finished-cmd time %s taper wrote %s:%s\n",
1012 walltime_str(curclock()), dp->host->hostname, dp->name);
1015 amfree(sched(dp)->destname);
1016 amfree(sched(dp)->dumpdate);
1017 amfree(sched(dp)->degr_dumpdate);
1018 amfree(sched(dp)->datestamp);
1025 /* continue with those dumps waiting for diskspace */
1026 continue_port_dumps();
1029 case TRYAGAIN: /* TRY-AGAIN <handle> <err mess> */
1030 if (result_argc < 2) {
1031 error("error [taper TRYAGAIN result_argc < 2: %d]",
1034 dp = serial2disk(result_argv[2]);
1035 free_serial(result_argv[2]);
1036 printf("driver: taper-tryagain time %s disk %s:%s\n",
1037 walltime_str(curclock()), dp->host->hostname, dp->name);
1040 /* See how many tapes we have left, but we always
1041 retry once (why?) */
1043 if(dp->tape_splitsize > 0)
1044 avail_tapes = conf_runtapes - current_tape;
1048 if(sched(dp)->attempted > avail_tapes) {
1049 log_add(L_FAIL, "%s %s %s %d [too many taper retries]",
1050 dp->host->hostname, dp->name, sched(dp)->datestamp,
1052 printf("driver: taper failed %s %s %s, too many taper retry\n",
1053 result_argv[2], dp->host->hostname, dp->name);
1056 /* Re-insert into taper queue. */
1057 sched(dp)->attempted++;
1058 headqueue_disk(&tapeq, dp);
1061 tape_left = tape_length;
1063 /* run next thing from queue */
1068 continue_port_dumps();
1071 case SPLIT_CONTINUE: /* SPLIT_CONTINUE <handle> <new_label> */
1072 if (result_argc != 3) {
1073 error("error [taper SPLIT_CONTINUE result_argc != 3: %d]",
1078 case SPLIT_NEEDNEXT: /* SPLIT-NEEDNEXT <handle> <kb written> */
1079 if (result_argc != 3) {
1080 error("error [taper SPLIT_NEEDNEXT result_argc != 3: %d]",
1084 /* Update our tape counter and reset tape_left */
1086 tape_left = tape_length;
1088 /* Reduce the size of the dump by amount written and reduce
1089 tape_left by the amount left over */
1090 dp = serial2disk(result_argv[2]);
1091 sched(dp)->act_size -= atoi(result_argv[3]);
1092 if (sched(dp)->act_size < tape_left)
1093 tape_left -= sched(dp)->act_size;
1099 case TAPE_ERROR: /* TAPE-ERROR <handle> <err mess> */
/* NOTE(review): unlike the cases above, no result_argc check precedes the
 * use of result_argv[2] and result_argv[3] in this case. */
1100 dp = serial2disk(result_argv[2]);
1101 free_serial(result_argv[2]);
1102 printf("driver: finished-cmd time %s taper wrote %s:%s\n",
1103 walltime_str(curclock()), dp->host->hostname, dp->name);
1105 log_add(L_WARNING, "Taper error: %s", result_argv[3]);
1110 log_add(L_WARNING, "Taper protocol error");
1113 * Since we've gotten a taper error, we can't send anything more
1114 * to the taper. Go into degraded mode to try to get everthing
1115 * onto disk. Later, these dumps can be flushed to a new tape.
1116 * The tape queue is zapped so that it appears empty in future
1117 * checks. If there are dumps waiting for diskspace to be freed,
1122 "going into degraded mode because of taper component error.");
1123 start_degraded_mode(&runq);
1125 tapeq.head = tapeq.tail = NULL;
1128 if(taper_ev_read != NULL) {
1129 event_release(taper_ev_read);
1130 taper_ev_read = NULL;
1132 if(cmd != TAPE_ERROR) aclose(taper);
1133 continue_port_dumps();
1137 error("driver received unexpected token (%s) from taper",
1141 * Wakeup any dumpers that are sleeping because of network
1142 * or disk constraints.
1144 start_some_dumps(&runq);
1146 } while(areads_dataready(taper));
/*
 * Returns the first entry of dmptable that is neither busy nor down.
 * NOTE(review): presumably the body of idle_dumper(); the signature and the
 * value returned when every dumper is busy or down are not visible here.
 */
1154 for(dumper = dmptable; dumper < dmptable+inparallel; dumper++)
1155 if(!dumper->busy && !dumper->down) return dumper;
/*
 * Counts the entries of dmptable whose busy flag is set, accumulating in n.
 * NOTE(review): presumably the body of num_busy_dumpers(); the signature,
 * the initialisation of n and the return are not visible here.
 */
1167 for(dumper = dmptable; dumper < dmptable+inparallel; dumper++)
1168 if(dumper->busy) n += 1;
/*
 * Body of dumper_result() (its signature lines are not visible here).
 * Finishes a dump once both its dumper and its chunker have reported:
 * records the dump statistics when both results are DONE, gives back the
 * network bandwidth, renames the temporary holding file, settles the
 * holding-disk accounting, and then either requeues the disk on runq
 * (first failure), queues it on tapeq (more than DISK_BLOCK_KB was
 * written) or deletes its holding space.  Finally the chunker is reaped
 * and waiting or sleeping dumps are given a chance to proceed.
 */
1180 assignedhd_t **h=NULL;
1181 int activehd, i, dummy;
1185 dumper = sched(dp)->dumper;
1186 chunker = dumper->chunker;
1190 h = sched(dp)->holdp;
1191 activehd = sched(dp)->activehd;
1193 if(dumper->result == DONE && chunker->result == DONE) {
1194 update_info_dumper(dp, sched(dp)->origsize,
1195 sched(dp)->dumpsize, sched(dp)->dumptime);
1198 deallocate_bandwidth(dp->host->netif, sched(dp)->est_kps);
1200 is_partial = dumper->result != DONE || chunker->result != DONE;
1201 rename_tmp_holding(sched(dp)->destname, !is_partial);
/* Add up what is already accounted on the holding disks before the active
 * one; the rest of the measured size is attributed to the active disk.
 * NOTE(review): the initialisation of dummy is not visible in this
 * extract -- confirm it is zeroed before this loop. */
1204 for( i = 0, h = sched(dp)->holdp; i < activehd; i++ ) {
1205 dummy += h[i]->used;
1208 size = size_holding_files(sched(dp)->destname, 0);
1209 h[activehd]->used = size - dummy;
1210 holdalloc(h[activehd]->disk)->allocated_dumpers--;
1211 adjust_diskspace(dp, DONE);
1213 sched(dp)->attempted += 1;
/* A first failed attempt is retried from scratch. */
1215 if((dumper->result != DONE || chunker->result != DONE) &&
1216 sched(dp)->attempted <= 1) {
1217 delete_diskspace(dp);
1218 enqueue_disk(&runq, dp);
1220 else if(size > DISK_BLOCK_KB) {
1221 sched(dp)->attempted = 0;
1222 enqueue_disk(&tapeq, dp);
1226 delete_diskspace(dp);
1230 dp->host->inprogress -= 1;
1233 waitpid(chunker->pid, NULL, 0 );
1234 aclose(chunker->fd);
1239 continue_port_dumps();
1241 * Wakeup any dumpers that are sleeping because of network
1242 * or disk constraints.
1244 start_some_dumps(&runq);
/*
 * Event callback for replies from a dumper; cookie is the dumper_t that
 * was passed when the read event was registered.  Replies are read with
 * getresult() in a loop for as long as areads_dataready() reports more
 * data.  Visible cases: DONE (store origsize and dumptime), TRY-AGAIN and
 * FAILED, ABORT-FINISHED, and a fallback for EOF or garbage that marks the
 * dumper down.  Each case stores the reply in dumper->result; the outcome
 * is then forwarded to the chunker as DONE or FAILED when the chunker is
 * still up with an open descriptor.
 */
1249 handle_dumper_result(cookie)
1252 /*static int pending_aborts = 0;*/
1253 dumper_t *dumper = cookie;
1257 char *result_argv[MAX_ARGS+1];
1259 assert(dumper != NULL);
1261 assert(dp != NULL && sched(dp) != NULL);
1267 cmd = getresult(dumper->fd, 1, &result_argc, result_argv, MAX_ARGS+1);
1270 /* result_argv[2] always contains the serial number */
1271 sdp = serial2disk(result_argv[2]);
1277 case DONE: /* DONE <handle> <origsize> <dumpsize> <dumptime> <errstr> */
1278 if(result_argc != 6) {
1279 error("error [dumper DONE result_argc != 6: %d]", result_argc);
1282 /*free_serial(result_argv[2]);*/
/* Only origsize and dumptime are taken from the dumper reply here. */
1284 sched(dp)->origsize = (long)atof(result_argv[3]);
1285 sched(dp)->dumptime = (long)atof(result_argv[5]);
1287 printf("driver: finished-cmd time %s %s dumped %s:%s\n",
1288 walltime_str(curclock()), dumper->name,
1289 dp->host->hostname, dp->name);
1292 dumper->result = cmd;
1296 case TRYAGAIN: /* TRY-AGAIN <handle> <errstr> */
1298 * Requeue this disk, and fall through to the FAILED
1301 if(sched(dp)->attempted) {
1302 log_add(L_FAIL, "%s %s %s %d [too many dumper retry: %s]",
1303 dp->host->hostname, dp->name, sched(dp)->datestamp,
1304 sched(dp)->level, result_argv[3]);
1305 printf("driver: dump failed %s %s %s, too many dumper retry: %s\n",
1306 result_argv[2], dp->host->hostname, dp->name,
1310 case FAILED: /* FAILED <handle> <errstr> */
1311 /*free_serial(result_argv[2]);*/
1312 dumper->result = cmd;
1315 case ABORT_FINISHED: /* ABORT-FINISHED <handle> */
1317 * We sent an ABORT from the NO-ROOM case because this dump
1318 * wasn't going to fit onto the holding disk. We now need to
1319 * clean up the remains of this image, and try to finish
1320 * other dumps that are waiting on disk space.
1322 assert(pending_aborts);
1323 /*free_serial(result_argv[2]);*/
1324 dumper->result = cmd;
1328 /* either EOF or garbage from dumper. Turn it off */
1329 log_add(L_WARNING, "%s pid %ld is messed up, ignoring it.\n",
1330 dumper->name, (long)dumper->pid);
1331 event_release(dumper->ev_read);
1332 dumper->ev_read = NULL;
1335 dumper->down = 1; /* mark it down so it isn't used again */
1337 /* if it was dumping something, zap it and try again */
1338 if(sched(dp)->attempted) {
1339 log_add(L_FAIL, "%s %s %s %d [%s died]",
1340 dp->host->hostname, dp->name, sched(dp)->datestamp,
1341 sched(dp)->level, dumper->name);
1344 log_add(L_WARNING, "%s died while dumping %s:%s lev %d.",
1345 dumper->name, dp->host->hostname, dp->name,
1349 dumper->result = cmd;
1355 /* send the dumper result to the chunker */
1356 if(dumper->chunker->down == 0 && dumper->chunker->fd != -1) {
1358 chunker_cmd(dumper->chunker, DONE, dp);
1361 chunker_cmd(dumper->chunker, FAILED, dp);
1365 if(dumper->result != LAST_TOK && dumper->chunker->result != LAST_TOK)
1368 } while(areads_dataready(dumper->fd));
/*
 * Event callback run when a chunker's result descriptor has data.
 *
 * cookie - the chunker_t that owns the descriptor.
 *
 * Replies are read with getresult() for as long as areads_dataready()
 * reports more input on chunker->fd, and are dispatched on the reply token:
 *   PARTIAL / DONE - must carry exactly 4 arguments; argv[3] is stored in
 *                    sched(dp)->dumpsize, the read event is released and
 *                    the token is saved in chunker->result.
 *   TRY-AGAIN      - the read event is released.
 *   FAILED         - the read event is released and the token is saved.
 *   NO-ROOM        - the active holding-disk entry is shrunk by argv[3].
 *   RQ-MORE-DISK   - the next reserved holding disk is used, or more space
 *                    is looked for with find_diskspace(); the disk is put
 *                    on roomq when no space is available.
 *   ABORT-FINISHED - the read event is released and the token is saved.
 *   anything else  - treated as EOF or garbage from the chunker: a warning
 *                    is logged and the token is saved.
 */
1373 handle_chunker_result(cookie)
1376 /*static int pending_aborts = 0;*/
1377 chunker_t *chunker = cookie;
1378 assignedhd_t **h=NULL;
1383 char *result_argv[MAX_ARGS+1];
1388 assert(chunker != NULL);
1389 dumper = chunker->dumper;
1390 assert(dumper != NULL);
1393 assert(sched(dp) != NULL);
1394 assert(sched(dp)->destname != NULL);
1395 assert(dp != NULL && sched(dp) != NULL && sched(dp)->destname);
/* pick up this dump's holding-disk reservations and the index of the one in use */
1397 if(dp && sched(dp) && sched(dp)->holdp) {
1398 h = sched(dp)->holdp;
1399 activehd = sched(dp)->activehd;
1406 cmd = getresult(chunker->fd, 1, &result_argc, result_argv, MAX_ARGS+1);
1409 /* result_argv[2] always contains the serial number */
1410 sdp = serial2disk(result_argv[2]);
1416 case PARTIAL: /* PARTIAL <handle> <dumpsize> <errstr> */
1417 case DONE: /* DONE <handle> <dumpsize> <errstr> */
1418 if(result_argc != 4) {
1419 error("error [chunker %s result_argc != 4: %d]", cmdstr[cmd],
1422 /*free_serial(result_argv[2]);*/
1424 sched(dp)->dumpsize = (long)atof(result_argv[3]);
1426 printf("driver: finished-cmd time %s %s chunked %s:%s\n",
1427 walltime_str(curclock()), chunker->name,
1428 dp->host->hostname, dp->name);
1431 event_release(chunker->ev_read);
1433 chunker->result = cmd;
1437 case TRYAGAIN: /* TRY-AGAIN <handle> <errstr> */
1439 event_release(chunker->ev_read);
1442 case FAILED: /* FAILED <handle> <errstr> */
1443 /*free_serial(result_argv[2]);*/
1445 event_release(chunker->ev_read);
1447 chunker->result = cmd;
1451 case NO_ROOM: /* NO-ROOM <handle> <missing_size> */
1452 assert( h && activehd >= 0 );
/* take the missing size out of this entry's used/reserved counts and out of
 * the holding disk's allocated_space and disksize */
1453 h[activehd]->used -= atoi(result_argv[3]);
1454 h[activehd]->reserved -= atoi(result_argv[3]);
1455 holdalloc(h[activehd]->disk)->allocated_space -= atoi(result_argv[3]);
1456 h[activehd]->disk->disksize -= atoi(result_argv[3]);
1459 case RQ_MORE_DISK: /* RQ-MORE-DISK <handle> */
1460 assert( h && activehd >= 0 );
/* the current entry is exhausted: one dumper fewer on that disk, and all of
 * its reservation now counts as used */
1461 holdalloc(h[activehd]->disk)->allocated_dumpers--;
1462 h[activehd]->used = h[activehd]->reserved;
1463 if( h[++activehd] ) { /* There's still some allocated space left.
1464 * Tell the dumper about it. */
1465 sched(dp)->activehd++;
1466 chunker_cmd( chunker, CONTINUE, dp );
1467 } else { /* !h[++activehd] - must allocate more space */
1468 sched(dp)->act_size = sched(dp)->est_size; /* not quite true */
1469 sched(dp)->est_size = sched(dp)->act_size * 21 / 20; /* +5% */
1470 sched(dp)->est_size = am_round(sched(dp)->est_size, DISK_BLOCK_KB);
1471 h = find_diskspace( sched(dp)->est_size - sched(dp)->act_size,
1475 /* No diskspace available. The reason for this will be
1476 * determined in continue_port_dumps(). */
1477 enqueue_disk( &roomq, dp );
1478 continue_port_dumps();
1480 /* OK, allocate space for disk and have chunker continue */
1481 sched(dp)->activehd = assign_holdingdisk( h, dp );
1482 chunker_cmd( chunker, CONTINUE, dp );
1488 case ABORT_FINISHED: /* ABORT-FINISHED <handle> */
1490 * We sent an ABORT from the NO-ROOM case because this dump
1491 * wasn't going to fit onto the holding disk. We now need to
1492 * clean up the remains of this image, and try to finish
1493 * other dumps that are waiting on disk space.
1495 /*assert(pending_aborts);*/
1497 /*free_serial(result_argv[2]);*/
1499 event_release(chunker->ev_read);
1501 chunker->result = cmd;
1506 /* either EOF or garbage from chunker. Turn it off */
1507 log_add(L_WARNING, "%s pid %ld is messed up, ignoring it.\n",
1508 chunker->name, (long)chunker->pid);
1511 /* if it was dumping something, zap it and try again */
1512 assert( h && activehd >= 0 );
1513 if(sched(dp)->attempted) {
1514 log_add(L_FAIL, "%s %s %s %d [%s died]",
1515 dp->host->hostname, dp->name, sched(dp)->datestamp,
1516 sched(dp)->level, chunker->name);
1519 log_add(L_WARNING, "%s died while dumping %s:%s lev %d.",
1520 chunker->name, dp->host->hostname, dp->name,
1526 event_release(chunker->ev_read);
1528 chunker->result = cmd;
1536 if(chunker->result != LAST_TOK && chunker->dumper->result != LAST_TOK)
1539 } while(areads_dataready(chunker->fd));
/*
 * Reads flush requests from stdin, one per line, until the ENDFLUSH
 * command is seen.  Every other line must have the form
 *
 *     FLUSH <hostname> <diskname> <datestamp> <level> <filename>
 *
 * and a malformed line is reported through error() with its line number.
 * For each request the named holding file is examined with get_dumpfile()
 * and checked against the request and the disk database; accepted files
 * get a freshly allocated disk_t/sched_t pair that is appended to the
 * local queue tq.
 */
1550 char *hostname, *diskname, *datestamp;
1554 char *inpline = NULL;
1560 tq.head = tq.tail = NULL;
1562 for(line = 0; (inpline = agets(stdin)) != NULL; free(inpline)) {
1568 skip_whitespace(s, ch); /* find the command */
1570 error("flush line %d: syntax error (no command)", line);
1574 skip_non_whitespace(s, ch);
1577 if(strcmp(command,"ENDFLUSH") == 0) {
1581 if(strcmp(command,"FLUSH") != 0) {
1582 error("flush line %d: syntax error (%s != FLUSH)", line, command);
1586 skip_whitespace(s, ch); /* find the hostname */
1588 error("flush line %d: syntax error (no hostname)", line);
1592 skip_non_whitespace(s, ch);
1595 skip_whitespace(s, ch); /* find the diskname */
1597 error("flush line %d: syntax error (no diskname)", line);
1601 skip_non_whitespace(s, ch);
1604 skip_whitespace(s, ch); /* find the datestamp */
1606 error("flush line %d: syntax error (no datestamp)", line);
1610 skip_non_whitespace(s, ch);
1613 skip_whitespace(s, ch); /* find the level number */
1614 if(ch == '\0' || sscanf(s - 1, "%d", &level) != 1) {
1615 error("flush line %d: syntax error (bad level)", line);
1618 skip_integer(s, ch);
1620 skip_whitespace(s, ch); /* find the filename */
1622 error("flush line %d: syntax error (no filename)", line);
1626 skip_non_whitespace(s, ch);
/* only F_DUMPFILE headers are accepted; a log entry is written for anything
 * that is neither F_DUMPFILE nor F_CONT_DUMPFILE */
1629 get_dumpfile(destname, &file);
1630 if( file.type != F_DUMPFILE) {
1631 if( file.type != F_CONT_DUMPFILE )
1632 log_add(L_INFO, "%s: ignoring cruft file.", destname);
/* the request must agree with the host, disk and datestamp in the header */
1636 if(strcmp(hostname, file.name) != 0 ||
1637 strcmp(diskname, file.disk) != 0 ||
1638 strcmp(datestamp, file.datestamp) != 0) {
1639 log_add(L_INFO, "disk %s:%s not consistent with file %s",
1640 hostname, diskname, destname);
1644 dp = lookup_disk(file.name, file.disk);
1647 log_add(L_INFO, "%s: disk %s:%s not in database, skipping it.",
1648 destname, file.name, file.disk);
/* dump levels outside 0..9 are rejected */
1652 if(file.dumplevel < 0 || file.dumplevel > 9) {
1653 log_add(L_INFO, "%s: ignoring file with bogus dump level %d.",
1654 destname, file.dumplevel);
/* a separate disk_t is allocated for the flush and chained onto flushhost */
1658 dp1 = (disk_t *)alloc(sizeof(disk_t));
1660 dp1->next = dp1->prev = NULL;
1662 /* add it to the flushhost list */
1664 flushhost = alloc(sizeof(am_host_t));
1665 flushhost->next = NULL;
1666 flushhost->hostname = stralloc("FLUSHHOST");
1667 flushhost->up = NULL;
1668 flushhost->features = NULL;
1670 dp1->hostnext = flushhost->disks;
1671 flushhost->disks = dp1;
/* schedule record for the file: level and datestamp come from its header */
1673 sp = (sched_t *) alloc(sizeof(sched_t));
1674 sp->destname = stralloc(destname);
1675 sp->level = file.dumplevel;
1676 sp->dumpdate = NULL;
1677 sp->degr_dumpdate = NULL;
1678 sp->datestamp = stralloc(file.datestamp);
1682 sp->degr_level = -1;
1685 sp->act_size = size_holding_files(destname, 0);
1686 sp->holdp = build_diskspace(destname);
/* NOTE(review): on this path sp is not released and dp1 is already linked
 * into flushhost->disks - confirm whether that is intended */
1687 if(sp->holdp == NULL) continue;
1689 sp->timestamp = (time_t)0;
1691 dp1->up = (char *)sp;
1693 enqueue_disk(&tq, dp1);
/*
 * Event callback that reads the dump schedule from stdin.
 *
 * The read event schedule_ev_read is released first, then every input line
 * is parsed.  A line must have the form
 *
 *     DUMP <host> <features> <disk> <datestamp> <priority> <level>
 *          <dumpdate> <size> <time>
 *          [<degr level> <degr dumpdate> <degr size> <degr time>]
 *
 * and a malformed line is reported through error() with its line number.
 * A disk that lookup_disk() does not know is logged and ignored.  For every
 * other disk a sched_t is allocated and filled in, the host's feature set
 * is recorded if it was not known yet, and the disk is moved from waitq to
 * runq.  Afterwards the flush size is printed, an empty schedule is logged
 * as a warning, degraded mode is started when need_degraded is 1, and
 * start_some_dumps() is called on runq.
 */
1701 read_schedule(cookie)
1707 int level, line, priority;
1708 char *dumpdate, *degr_dumpdate;
1710 long time, degr_time;
1711 unsigned long size, degr_size;
1712 char *hostname, *features, *diskname, *datestamp, *inpline = NULL;
1716 long flush_size = 0;
1718 rq.head = rq.tail = NULL;
1720 event_release(schedule_ev_read);
1722 /* read schedule from stdin */
1724 for(line = 0; (inpline = agets(stdin)) != NULL; free(inpline)) {
1730 skip_whitespace(s, ch); /* find the command */
1732 error("schedule line %d: syntax error (no command)", line);
1736 skip_non_whitespace(s, ch);
1739 if(strcmp(command,"DUMP") != 0) {
1740 error("schedule line %d: syntax error (%s != DUMP)", line, command);
1744 skip_whitespace(s, ch); /* find the host name */
1746 error("schedule line %d: syntax error (no host name)", line);
1750 skip_non_whitespace(s, ch);
1753 skip_whitespace(s, ch); /* find the feature list */
1755 error("schedule line %d: syntax error (no feature list)", line);
1759 skip_non_whitespace(s, ch);
1762 skip_whitespace(s, ch); /* find the disk name */
1764 error("schedule line %d: syntax error (no disk name)", line);
1768 skip_non_whitespace(s, ch);
1771 skip_whitespace(s, ch); /* find the datestamp */
1773 error("schedule line %d: syntax error (no datestamp)", line);
1777 skip_non_whitespace(s, ch);
1780 skip_whitespace(s, ch); /* find the priority number */
1781 if(ch == '\0' || sscanf(s - 1, "%d", &priority) != 1) {
1782 error("schedule line %d: syntax error (bad priority)", line);
1785 skip_integer(s, ch);
1787 skip_whitespace(s, ch); /* find the level number */
1788 if(ch == '\0' || sscanf(s - 1, "%d", &level) != 1) {
1789 error("schedule line %d: syntax error (bad level)", line);
1792 skip_integer(s, ch);
1794 skip_whitespace(s, ch); /* find the dump date */
1796 error("schedule line %d: syntax error (bad dump date)", line);
1800 skip_non_whitespace(s, ch);
1803 skip_whitespace(s, ch); /* find the size number */
1804 if(ch == '\0' || sscanf(s - 1, "%lu", &size) != 1) {
1805 error("schedule line %d: syntax error (bad size)", line);
1808 skip_integer(s, ch);
1810 skip_whitespace(s, ch); /* find the time number */
1811 if(ch == '\0' || sscanf(s - 1, "%ld", &time) != 1) {
1812 error("schedule line %d: syntax error (bad estimated time)", line);
1815 skip_integer(s, ch);
/* the four degraded-mode fields are optional; degr_dumpdate stays NULL
 * unless they are present */
1817 degr_dumpdate = NULL; /* flag if degr fields found */
1818 skip_whitespace(s, ch); /* find the degr level number */
1820 if(sscanf(s - 1, "%d", &degr_level) != 1) {
1821 error("schedule line %d: syntax error (bad degr level)", line);
1824 skip_integer(s, ch);
1826 skip_whitespace(s, ch); /* find the degr dump date */
1828 error("schedule line %d: syntax error (bad degr dump date)", line);
1831 degr_dumpdate = s - 1;
1832 skip_non_whitespace(s, ch);
1835 skip_whitespace(s, ch); /* find the degr size number */
1836 if(ch == '\0' || sscanf(s - 1, "%lu", &degr_size) != 1) {
1837 error("schedule line %d: syntax error (bad degr size)", line);
1840 skip_integer(s, ch);
1842 skip_whitespace(s, ch); /* find the degr time number */
/* degr_time is a signed long, so it is scanned with %ld like time above */
1843 if(ch == '\0' || sscanf(s - 1, "%ld", &degr_time) != 1) {
1844 error("schedule line %d: syntax error (bad degr estimated time)", line);
1847 skip_integer(s, ch);
1850 dp = lookup_disk(hostname, diskname);
1853 "schedule line %d: %s:%s not in disklist, ignored",
1854 line, hostname, diskname);
/* build the schedule record for this disk from the parsed fields */
1858 sp = (sched_t *) alloc(sizeof(sched_t));
1860 sp->dumpdate = stralloc(dumpdate);
1861 sp->est_size = DISK_BLOCK_KB + size; /* include header */
1862 sp->est_time = time;
1863 sp->priority = priority;
1864 sp->datestamp = stralloc(datestamp);
1867 sp->degr_level = degr_level;
1868 sp->degr_dumpdate = stralloc(degr_dumpdate);
1869 sp->degr_size = DISK_BLOCK_KB + degr_size;
1870 sp->degr_time = degr_time;
1872 sp->degr_level = -1;
1873 sp->degr_dumpdate = NULL;
1879 sp->est_kps = size/time;
1881 if(sp->degr_level != -1) {
1885 sp->degr_kps = degr_size/degr_time;
1893 sp->timestamp = (time_t)0;
1894 sp->destname = NULL;
1897 dp->up = (char *) sp;
/* the feature string is converted only the first time a host is seen */
1898 if(dp->host->features == NULL) {
1899 dp->host->features = am_string_to_feature(features);
/* the disk now has a schedule: move it from the wait queue to the run queue */
1901 remove_disk(&waitq, dp);
1902 enqueue_disk(&runq, dp);
1903 flush_size += sp->act_size;
1905 printf("driver: flush size %ld\n", flush_size);
1908 log_add(L_WARNING, "WARNING: got empty schedule from planner");
1909 if(need_degraded==1) start_degraded_mode(&runq);
1910 start_some_dumps(&runq);
/*
 * Free network bandwidth.  With a null ip, the maxusage and curusage of
 * every interface in the list returned by lookup_interface(NULL) are
 * summed and the difference of the two sums is taken; otherwise the
 * difference is taken for the single interface ip.
 * NOTE(review): res is presumably the value returned - confirm.
 */
1919 if (ip == (interface_t *)0) {
1923 for(p = lookup_interface(NULL); p != NULL; p = p->next) {
1924 maxusage += p->maxusage;
1925 curusage += p->curusage;
1927 res = maxusage - curusage;
1930 res = ip->maxusage - ip->curusage;
/*
 * Prints an "interface-state" status line to stdout: the caller-supplied
 * time string followed by the name and free_kps() value of every interface
 * in the list returned by lookup_interface(NULL).
 */
1937 interface_state(time_str)
1942 printf("driver: interface-state time %s", time_str);
1944 for(ip = lookup_interface(NULL); ip != NULL; ip = ip->next) {
1945 printf(" if %s: free %d", ip->name, free_kps(ip));
/*
 * Charges kps against interface ip by adding it to ip->curusage.
 * No check against ip->maxusage is visible here.
 */
1951 allocate_bandwidth(ip, kps)
1955 ip->curusage += kps;
/*
 * Gives kps back to interface ip by subtracting it from ip->curusage.
 * The assert guards against releasing more than is currently charged.
 */
1959 deallocate_bandwidth(ip, kps)
1963 assert(kps <= ip->curusage);
1964 ip->curusage -= kps;
/*
 * Walks the configured holding disks; for each one the space not yet
 * handed out is disksize minus holdalloc(hdp)->allocated_space.
 * NOTE(review): total_free presumably accumulates these differences and is
 * the value returned - confirm.
 */
1968 static unsigned long
1972 unsigned long total_free;
1976 for(hdp = getconf_holdingdisks(); hdp != NULL; hdp = hdp->next) {
1977 diff = hdp->disksize - holdalloc(hdp)->allocated_space;
/*
 * Picks holding disks that together can take `size` of dump data.
 *
 * size     - amount wanted; the debug output prints it in K.
 * cur_idle - NOTE(review): its use is not visible here - confirm.
 * pref     - optional entry whose disk is considered first if that disk
 *            has not been used in this run and still has at least one
 *            DISK_BLOCK_KB free.
 *
 * Each selected disk contributes one assignedhd_t whose `reserved` field
 * covers the data placed there plus one DISK_BLOCK_KB header per chunk.
 * When space is still missing after the search, the partial result is
 * released with free_assignedhd().
 */
1984 static assignedhd_t **
1985 find_diskspace(size, cur_idle, pref)
1989 /* We return an array of pointers to assignedhd_t. The array contains at
1990 * most one entry per holding disk. The list of pointers is terminated by
1991 * a NULL pointer. Each entry contains a pointer to a holdingdisk and
1992 * how much diskspace to use on that disk. Later on, assign_holdingdisk
1993 * will allocate the given amount of space.
1994 * If there is not enough room on the holdingdisks, NULL is returned.
1998 assignedhd_t **result = NULL;
1999 holdingdisk_t *minp, *hdp;
2000 int i=0, num_holdingdisks=0; /* are we allowed to use the global thing? */
2003 long halloc, dalloc, hfree, dfree;
2005 size = am_round(size, DISK_BLOCK_KB);
2008 printf("%s: want %lu K\n", debug_prefix_time(": find_diskspace"), size);
2012 for(hdp = getconf_holdingdisks(); hdp != NULL; hdp = hdp->next) {
/* one flag per holding disk, plus room for the terminating NULL in result */
2016 used = alloc(sizeof(char) * num_holdingdisks);/*disks used during this run*/
2017 memset( used, 0, num_holdingdisks );
2018 result = alloc( sizeof(assignedhd_t *) * (num_holdingdisks+1) );
2021 while( i < num_holdingdisks && size > 0 ) {
2022 /* find the holdingdisk with the fewest active dumpers and among
2023 * those the one with the biggest free space
2025 minp = NULL; minj = -1;
2026 for(j = 0, hdp = getconf_holdingdisks(); hdp != NULL; hdp = hdp->next, j++ ) {
2027 if( pref && pref->disk == hdp && !used[j] &&
2028 holdalloc(hdp)->allocated_space <= hdp->disksize - DISK_BLOCK_KB) {
2033 else if( holdalloc(hdp)->allocated_space <= hdp->disksize - 2*DISK_BLOCK_KB &&
2036 holdalloc(hdp)->allocated_dumpers < holdalloc(minp)->allocated_dumpers ||
2037 (holdalloc(hdp)->allocated_dumpers == holdalloc(minp)->allocated_dumpers &&
2038 hdp->disksize-holdalloc(hdp)->allocated_space > minp->disksize-holdalloc(minp)->allocated_space)) ) {
2045 if( !minp ) { break; } /* all holding disks are full */
2048 /* hfree = free space on the disk */
2049 hfree = minp->disksize - holdalloc(minp)->allocated_space;
2051 /* dfree = free space for data, remove 1 header for each chunksize */
2052 dfree = hfree - (((hfree-1)/minp->chunksize)+1) * DISK_BLOCK_KB;
2054 /* dalloc = space I can allocate for data */
2055 dalloc = ( dfree < size ) ? dfree : size;
2057 /* halloc = space to allocate, including 1 header for each chunksize */
2058 halloc = dalloc + (((dalloc-1)/minp->chunksize)+1) * DISK_BLOCK_KB;
2061 printf("%s: find diskspace: size %ld hf %ld df %ld da %ld ha %ld\n",
2062 debug_prefix_time(": find_diskspace"),
2063 size, hfree, dfree, dalloc, halloc);
/* record the chosen disk and the reservation computed for it; nothing is
 * marked as used on the disk yet */
2067 result[i] = alloc(sizeof(assignedhd_t));
2068 result[i]->disk = minp;
2069 result[i]->reserved = halloc;
2070 result[i]->used = 0;
2071 result[i]->destname = NULL;
2074 } /* while i < num_holdingdisks && size > 0 */
2077 if( size ) { /* not enough space available */
2078 printf("find diskspace: not enough diskspace. Left with %lu K\n", size);
2080 free_assignedhd(result);
2085 for( i = 0; result && result[i]; i++ ) {
2086 printf("%s: find diskspace: selected %s free %ld reserved %ld dumpers %d\n",
2087 debug_prefix_time(": find_diskspace"),
2088 result[i]->disk->diskdir,
2089 result[i]->disk->disksize - holdalloc(result[i]->disk)->allocated_space,
2090 result[i]->reserved,
2091 holdalloc(result[i]->disk)->allocated_dumpers);
/*
 * Attaches the reservations in holdp (a NULL-terminated array) to the
 * dump of diskp and charges them to the holding disks.
 *
 * sched(diskp)->holdp is replaced by a new array large enough for the old
 * entries plus the new ones.  When the dump already has entries and the
 * last one is on the same disk as holdp[0], the two are merged.  Every
 * remaining entry is given a destname, appended to sched(diskp)->holdp and
 * added to its disk's allocated_space.  Appended entries are cleared in
 * holdp, so they are owned by the schedule entry from then on.
 */
2100 assign_holdingdisk(holdp, diskp)
2101 assignedhd_t **holdp;
2106 char *sfn = sanitise_filename(diskp->name);
2108 assignedhd_t **new_holdp;
2110 snprintf( lvl, sizeof(lvl), "%d", sched(diskp)->level );
/* size = what is still to be placed: estimate minus what is already written */
2112 size = am_round(sched(diskp)->est_size - sched(diskp)->act_size,
2115 for( c = 0; holdp[c]; c++ ); /* count number of disks */
2117 /* allocate memory for sched(diskp)->holdp */
2118 for(j = 0; sched(diskp)->holdp && sched(diskp)->holdp[j]; j++) {}
2119 new_holdp = (assignedhd_t **)alloc(sizeof(assignedhd_t*)*(j+c+1));
2120 if (sched(diskp)->holdp) {
2121 memcpy(new_holdp, sched(diskp)->holdp, j * sizeof(*new_holdp));
2122 amfree(sched(diskp)->holdp);
2124 sched(diskp)->holdp = new_holdp;
2128 if( j > 0 ) { /* This is a request for additional diskspace. See if we can
2129 * merge assignedhd_t's */
2131 if( sched(diskp)->holdp[j-1]->disk == holdp[0]->disk ) { /* Yes! */
2132 sched(diskp)->holdp[j-1]->reserved += holdp[0]->reserved;
2133 holdalloc(holdp[0]->disk)->allocated_space += holdp[0]->reserved;
2134 size = (holdp[0]->reserved>size) ? 0 : size-holdp[0]->reserved;
2136 printf("%s: merging holding disk %s to disk %s:%s, add %lu for reserved %lu, left %lu\n",
2137 debug_prefix_time(": assign_holdingdisk"),
2138 sched(diskp)->holdp[j-1]->disk->diskdir,
2139 diskp->host->hostname, diskp->name,
2140 holdp[0]->reserved, sched(diskp)->holdp[j-1]->reserved,
2150 /* copy assignedhd_s to sched(diskp), adjust allocated_space */
2151 for( ; holdp[i]; i++ ) {
2152 holdp[i]->destname = newvstralloc( holdp[i]->destname,
2153 holdp[i]->disk->diskdir, "/",
2155 diskp->host->hostname, ".",
2158 sched(diskp)->holdp[j++] = holdp[i];
2159 holdalloc(holdp[i]->disk)->allocated_space += holdp[i]->reserved;
2160 size = (holdp[i]->reserved>size) ? 0 : size-holdp[i]->reserved;
2162 printf("%s: %d assigning holding disk %s to disk %s:%s, reserved %lu, left %lu\n",
2163 debug_prefix_time(": assign_holdingdisk"),
2164 i, holdp[i]->disk->diskdir, diskp->host->hostname, diskp->name,
2165 holdp[i]->reserved, size );
2168 holdp[i] = NULL; /* so it doesn't get free()d... */
2170 sched(diskp)->holdp[j] = NULL;
/*
 * Reconciles the holding-disk bookkeeping of diskp with what was actually
 * written.  For every assignedhd_t of the dump the difference between
 * `used` and `reserved` is applied to the disk's allocated_space and to the
 * entry's own `reserved`, which therefore ends up equal to `used`.  The sum
 * of all `used` values is stored in sched(diskp)->act_size.
 * NOTE(review): no use of `cmd` is visible in this function - confirm.
 */
2177 adjust_diskspace(diskp, cmd)
2181 assignedhd_t **holdp;
2182 unsigned long total=0;
2187 printf("%s: %s:%s %s\n",
2188 debug_prefix_time(": adjust_diskspace"),
2189 diskp->host->hostname, diskp->name, sched(diskp)->destname);
2193 holdp = sched(diskp)->holdp;
2197 for( i = 0; holdp[i]; i++ ) { /* for each allocated disk */
/* diff is negative when less was used than reserved */
2198 diff = holdp[i]->used - holdp[i]->reserved;
2199 total += holdp[i]->used;
2200 holdalloc(holdp[i]->disk)->allocated_space += diff;
2203 printf("%s: hdisk %s done, reserved %ld used %ld diff %ld alloc %ld dumpers %d\n",
2204 debug_prefix_time(": adjust_diskspace"),
2205 holdp[i]->disk->name, holdp[i]->reserved, holdp[i]->used, diff,
2206 holdalloc(holdp[i]->disk)->allocated_space,
2207 holdalloc(holdp[i]->disk)->allocated_dumpers );
2210 holdp[i]->reserved += diff;
2213 sched(diskp)->act_size = total;
2216 printf("%s: after: disk %s:%s used %ld\n",
2217 debug_prefix_time(": adjust_diskspace"),
2218 diskp->host->hostname, diskp->name, sched(diskp)->act_size );
/*
 * Removes the holding-disk image of diskp: each entry's `used` amount is
 * taken off its disk's allocated_space, the holding files are unlinked
 * starting from the first entry's destname, the assignedhd list is freed
 * and sched(diskp)->holdp / act_size are reset.
 */
2225 delete_diskspace(diskp)
2228 assignedhd_t **holdp;
2231 holdp = sched(diskp)->holdp;
2235 for( i = 0; holdp[i]; i++ ) { /* for each disk */
2236 /* find all files of this dump on that disk, and subtract their
2237 * reserved sizes from the disk's allocated space
2239 holdalloc(holdp[i]->disk)->allocated_space -= holdp[i]->used;
2242 unlink_holding_files(holdp[0]->destname); /* no need for the entire list,
2243 * because unlink_holding_files
2244 * will walk through all files
2245 * using cont_filename */
2246 free_assignedhd(sched(diskp)->holdp);
2247 sched(diskp)->holdp = NULL;
2248 sched(diskp)->act_size = 0;
/*
 * Rebuilds the list of holding-disk reservations for an image that is
 * already on disk.
 *
 * destname - name of the first holding file of the image.
 *
 * Starting at destname and following file.cont_filename from each header,
 * every file is matched to a configured holding disk by its directory, and
 * its size in KB (st_size rounded up to 1024) is added to that disk's
 * tally.  The result is an array of assignedhd_t entries in which both
 * `reserved` and `used` hold the tally for the disk and destname is copied
 * into each entry.  stat() and open() failures are reported on stderr.
 */
2251 static assignedhd_t **build_diskspace(destname)
2257 char buffer[DISK_BLOCK_BYTES];
2259 assignedhd_t **result;
2262 int num_holdingdisks=0;
2263 char dirname[1000], *ch;
2265 char *filename = destname;
2267 for(hdp = getconf_holdingdisks(); hdp != NULL; hdp = hdp->next) {
/* used[j] is the per-disk tally; result has one spare slot beyond the disk count */
2270 used = alloc(sizeof(int) * num_holdingdisks);
2271 for(i=0;i<num_holdingdisks;i++)
2273 result = alloc( sizeof(assignedhd_t *) * (num_holdingdisks+1) );
/* walk the chain of files that make up the image */
2275 while(filename != NULL && filename[0] != '\0') {
2276 strncpy(dirname, filename, 999);
2278 ch = strrchr(dirname,'/');
2280 ch = strrchr(dirname,'/');
/* find the holding disk whose diskdir equals the directory just derived */
2283 for(j = 0, hdp = getconf_holdingdisks(); hdp != NULL;
2284 hdp = hdp->next, j++ ) {
2285 if(strcmp(dirname,hdp->diskdir)==0) {
2290 if(stat(filename, &finfo) == -1) {
2291 fprintf(stderr, "stat %s: %s\n", filename, strerror(errno));
2294 used[j] += (finfo.st_size+1023)/1024;
2295 if((fd = open(filename,O_RDONLY)) == -1) {
2296 fprintf(stderr,"build_diskspace: open of %s failed: %s\n",
2297 filename, strerror(errno));
2300 if ((buflen = fullread(fd, buffer, sizeof(buffer))) > 0) {;
2301 parse_file_header(buffer, &file, buflen);
/* the header names the next file of the image */
2304 filename = file.cont_filename;
2307 for(j = 0, i=0, hdp = getconf_holdingdisks(); hdp != NULL;
2308 hdp = hdp->next, j++ ) {
2310 result[i] = alloc(sizeof(assignedhd_t));
2311 result[i]->disk = hdp;
2312 result[i]->reserved = used[j];
2313 result[i]->used = used[j];
2314 result[i]->destname = stralloc(destname);
/*
 * Prints an "hdisk-state" status line to stdout: the caller-supplied time
 * string followed, for each configured holding disk in list order, by its
 * index, its free space (disksize minus allocated_space) and the number of
 * dumpers allocated to it.
 */
2325 holdingdisk_state(time_str)
2332 printf("driver: hdisk-state time %s", time_str);
2334 for(hdp = getconf_holdingdisks(), dsk = 0; hdp != NULL; hdp = hdp->next, dsk++) {
2335 diff = hdp->disksize - holdalloc(hdp)->allocated_space;
2336 printf(" hdisk %d: free %ld dumpers %d", dsk, diff,
2337 holdalloc(hdp)->allocated_dumpers);
/*
 * Records a failed direct-to-tape dump of dp in the info database.
 * update_info_dumper() is called with -1 for all three size/time arguments
 * while sched(dp)->timestamp is temporarily forced to 0; the original
 * timestamp is put back afterwards.
 */
2343 update_failed_dump_to_tape(dp)
2347 * should simply set no_bump
2350 time_t save_timestamp = sched(dp)->timestamp;
2351 /* setting timestamp to 0 removes the current level from the
2352 * database, so that we ensure that it will not be bumped to the
2353 * next level on the next run. If we didn't do this, dumpdates or
2354 * gnutar-lists might have been updated already, and a bumped
2355 * incremental might be created. */
2356 sched(dp)->timestamp = 0;
2357 update_info_dumper(dp, -1, -1, -1);
2358 sched(dp)->timestamp = save_timestamp;
2361 /* ------------------- */
/*
 * Dumps the disk dp directly to tape.
 *
 * Steps visible here: pick an idle dumper, ask the taper for a port with
 * PORT_WRITE, pass that port to the dumper with PORT_DUMP, wait for the
 * dumper's reply, then wait for the taper's reply and update the info
 * database.
 *
 * `failed` carries the outcome of the dumper side into the taper phase:
 * 1 means the dump may be retried, 2 means a fatal problem.  The value 2
 * is also returned directly when no dumper is idle or the taper does not
 * answer with a port.
 */
2375 char *result_argv[MAX_ARGS+1];
2376 int dumper_tryagain = 0;
2378 printf("driver: dumping %s:%s directly to tape\n",
2379 dp->host->hostname, dp->name);
2382 /* pick a dumper and fail if there are no idle dumpers */
2384 dumper = idle_dumper();
2386 printf("driver: no idle dumpers for %s:%s.\n",
2387 dp->host->hostname, dp->name);
2389 log_add(L_WARNING, "no idle dumpers for %s:%s.\n",
2390 dp->host->hostname, dp->name);
2391 return 2; /* fatal problem */
2394 /* tell the taper to read from a port number of its choice */
2396 taper_cmd(PORT_WRITE, dp, NULL, sched(dp)->level, sched(dp)->datestamp);
2397 cmd = getresult(taper, 1, &result_argc, result_argv, MAX_ARGS+1);
2399 printf("driver: did not get PORT from taper for %s:%s\n",
2400 dp->host->hostname, dp->name);
2402 return 2; /* fatal problem */
2404 /* copy port number */
2405 dumper->output_port = atoi(result_argv[2]);
2407 /* tell the dumper to dump to a port */
2409 dumper_cmd(dumper, PORT_DUMP, dp);
2410 dp->host->start_t = time(NULL) + 15;
2412 /* update statistics & print state */
2414 taper_busy = dumper->busy = 1;
2415 dp->host->inprogress += 1;
2417 sched(dp)->timestamp = time((time_t *)0);
2418 allocate_bandwidth(dp->host->netif, sched(dp)->est_kps);
2419 idle_reason = NOT_IDLE;
2423 /* wait for result from dumper */
2425 cmd = getresult(dumper->fd, 1, &result_argc, result_argv, MAX_ARGS+1);
2428 free_serial(result_argv[2]);
2432 /* either eof or garbage from dumper */
2433 log_add(L_WARNING, "%s pid %ld is messed up, ignoring it.\n",
2434 dumper->name, (long)dumper->pid);
2435 dumper->down = 1; /* mark it down so it isn't used again */
2436 failed = 1; /* dump failed, must still finish up with taper */
2439 case DONE: /* DONE <handle> <origsize> <dumpsize> <dumptime> <errstr> */
2440 /* everything went fine */
2441 origsize = (long)atof(result_argv[3]);
2442 /*dumpsize = (long)atof(result_argv[4]);*/
2443 dumptime = (long)atof(result_argv[5]);
2446 case NO_ROOM: /* NO-ROOM <handle> */
2447 dumper_cmd(dumper, ABORT, dp);
2448 cmd = getresult(dumper->fd, 1, &result_argc, result_argv, MAX_ARGS+1);
2450 free_serial(result_argv[2]);
2451 assert(cmd == ABORT_FINISHED);
2453 case TRYAGAIN: /* TRY-AGAIN <handle> <errstr> */
2455 /* dump failed, but we must still finish up with taper */
2456 /* problem with dump, possibly nonfatal, retry one time */
2457 sched(dp)->attempted++;
2458 failed = sched(dp)->attempted;
2459 dumper_tryagain = 1;
2462 case FAILED: /* FAILED <handle> <errstr> */
2463 /* dump failed, but we must still finish up with taper */
2464 failed = 2; /* fatal problem with dump */
2469 * Note that at this point, even if the dump above failed, it may
2470 * not be a fatal failure if taper below says we can try again.
2471 * E.g. a dumper failure above may actually be the result of a
2472 * tape overflow, which in turn causes dump to see "broken pipe",
2473 * "no space on device", etc., since taper closed the port first.
2478 cmd = getresult(taper, 1, &result_argc, result_argv, MAX_ARGS+1);
2482 case DONE: /* DONE <handle> <label> <tape file> <err mess> */
2483 if(result_argc != 5) {
2484 error("error [dump to tape DONE result_argc != 5: %d]", result_argc);
2487 if(failed == 1) goto tryagain; /* dump didn't work */
2488 else if(failed == 2) goto failed_dumper;
2490 free_serial(result_argv[2]);
/* the size written comes from the taper's message in argv[5], not from the
 * dumper's reply */
2492 sscanf(result_argv[5],"[sec %f kb %ld ", &tapetime, &dumpsize);
2495 /* every thing went fine */
2496 update_info_dumper(dp, origsize, dumpsize, dumptime);
2497 filenum = atoi(result_argv[4]);
2498 update_info_taper(dp, result_argv[3], filenum, sched(dp)->level);
2499 /* note that update_info_dumper() must be run before
2500 update_info_taper(), since update_info_dumper overwrites
2501 tape information. */
2506 case TRYAGAIN: /* TRY-AGAIN <handle> <err mess> */
2507 tape_left = tape_length;
/* count this attempt only if the dumper's TRY-AGAIN has not counted it already */
2509 if(dumper_tryagain == 0) {
2510 sched(dp)->attempted++;
2511 if(sched(dp)->attempted > failed)
2512 failed = sched(dp)->attempted;
2516 headqueue_disk(&runq, dp);
2518 update_failed_dump_to_tape(dp);
2519 free_serial(result_argv[2]);
2522 case SPLIT_CONTINUE: /* SPLIT_CONTINUE <handle> <new_label> */
2523 if (result_argc != 3) {
2524 error("error [taper SPLIT_CONTINUE result_argc != 3: %d]", result_argc);
2526 fprintf(stderr, "driver: Got SPLIT_CONTINUE %s %s\n", result_argv[2], result_argv[3]);
2528 goto continue_port_dump;
2530 case SPLIT_NEEDNEXT:
2531 fprintf(stderr, "driver: Got SPLIT_NEEDNEXT %s %s\n", result_argv[2], result_argv[3]);
2533 goto continue_port_dump;
2535 case TAPE_ERROR: /* TAPE-ERROR <handle> <err mess> */
2538 update_failed_dump_to_tape(dp);
2539 free_serial(result_argv[2]);
2540 failed = 2; /* fatal problem */
2541 start_degraded_mode(&runq);
2544 /* reset statistics & return */
2546 taper_busy = dumper->busy = 0;
2547 dp->host->inprogress -= 1;
2549 deallocate_bandwidth(dp->host->netif, sched(dp)->est_kps);
/* Count the entries of queue q by walking from q.head along the next
 * pointers; the loop body is intentionally empty and len holds the count
 * when the loop ends. */
2561 for(len = 0, p = q.head; p != NULL; len++, p = p->next);
/*
 * Prints a one-line "driver: state" summary to stdout: the wall-clock
 * time, the total free bandwidth and free holding space, the taper state,
 * the number of dumpers in dmptable[0..inparallel) that are not busy, the
 * lengths of tapeq, runq and roomq, the wakeup time and the idle reason.
 * The interface and holding-disk status lines follow with the same time
 * string.
 */
2571 wall_time = walltime_str(curclock());
2573 printf("driver: state time %s ", wall_time);
2574 printf("free kps: %d space: %lu taper: ",
2575 free_kps((interface_t *)0), free_space());
/* taper state: degraded mode takes precedence over busy/idle */
2576 if(degraded_mode) printf("DOWN");
2577 else if(!taper_busy) printf("idle");
2578 else printf("writing");
2580 for(i = 0; i < inparallel; i++) if(!dmptable[i].busy) nidle++;
2581 printf(" idle-dumpers: %d", nidle);
2582 printf(" qlen tapeq: %d", queue_length(tapeq));
2583 printf(" runq: %d", queue_length(runq));
2584 printf(" roomq: %d", queue_length(roomq));
2585 printf(" wakeup: %d", (int)sleep_time);
2586 printf(" driver-idle: %s\n", idle_strings[idle_reason]);
2587 interface_state(wall_time);
2588 holdingdisk_state(wall_time);
/*
 * Prints a multi-line report of the driver state to stdout, framed by
 * lines of '=' characters: the current time with the caller's label str,
 * the free bandwidth and holding space, what the taper is doing, one line
 * per dumper in dmptable[0..inparallel), and dumps of the TAPE, ROOM and
 * RUN queues.
 */
2600 printf("================\n");
2601 printf("driver state at time %s: %s\n", walltime_str(curclock()), str);
2602 printf("free kps: %d, space: %lu\n", free_kps((interface_t *)0), free_space());
/* taper_disk is only dereferenced when the taper is busy and not degraded */
2603 if(degraded_mode) printf("taper: DOWN\n");
2604 else if(!taper_busy) printf("taper: idle\n");
2605 else printf("taper: writing %s:%s.%d est size %lu\n",
2606 taper_disk->host->hostname, taper_disk->name,
2607 sched(taper_disk)->level,
2608 sched(taper_disk)->est_size);
2609 for(i = 0; i < inparallel; i++) {
2610 dp = dmptable[i].dp;
2611 if(!dmptable[i].busy)
2612 printf("%s: idle\n", dmptable[i].name);
2614 printf("%s: dumping %s:%s.%d est kps %d size %lu time %ld\n",
2615 dmptable[i].name, dp->host->hostname, dp->name, sched(dp)->level,
2616 sched(dp)->est_kps, sched(dp)->est_size, sched(dp)->est_time);
2618 dump_queue("TAPE", tapeq, 5, stdout);
2619 dump_queue("ROOM", roomq, 5, stdout);
2620 dump_queue("RUN ", runq, 5, stdout);
2621 printf("================\n");