Merge branch 'master' into squeeze

[debian/amanda] / perl / Amanda / Recovery / Planner.pm
diff --git a/perl/Amanda/Recovery/Planner.pm b/perl/Amanda/Recovery/Planner.pm

new file mode 100644 (file)

index 0000000..a310062
--- /dev/null
+++ b/perl/Amanda/Recovery/Planner.pm
@@ -0,0 +1,526 @@
+# Copyright (c) 2010 Zmanda, Inc.  All Rights Reserved.
+#
+# This library is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License version 2.1 as
+# published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this library; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA.
+#
+# Contact information: Zmanda Inc., 465 S. Mathilda Ave., Suite 300
+# Sunnyvale, CA 94086, USA, or: http://www.zmanda.com
+
+=head1 NAME
+
+Amanda::Recovery::Planner - use the catalog to plan recoveries
+
+=head1 SYNOPSIS
+
+    my $plan;
+
+    $subs{'make_plan'} = make_cb(make_plan => sub {
+       Amanda::Recovery::Planner::make_plan(
+           dumpspecs => [ $ds1, $ds2 ],
+           algorithm => $algo,
+           changer => $changer,
+           plan_cb => $subs{'plan_cb'});
+    });
+
+    $subs{'plan_cb'} = make_cb(plan_cb => sub {
+       my ($err, $pl) = @_;
+       die $err if $err;
+
+       $plan = $pl;
+       $subs{'start_next_dumpfile'}->();
+    });
+
+    $subs{'start_next_dumpfile'} = make_cb(start_next_dumpfile => sub {
+       my $dump = shift @{$plan->{'dumps'}};
+       if (!$dump) {
+           # .. all done!
+       }
+
+       print "recovering ", $dump->{'hostname'}, " ", $dump->{'diskname'}, "\n";
+       $clerk->get_xfer_src( .. dump => $dump .. );
+       # ..
+    });
+
+=head1 OVERVIEW
+
+This package determines the optimal way to recover dump files from storage.
+Its function is superficially fairly simple: given a collection of desired
+dumpfiles, it returns a Plan to recover those dumpfiles, specifying exactly the
+volumes and files that are needed, and the order in which they should be
+accessed.
+
+=head2 ALGORITHMS
+
+Several algorithms will soon be available for selecting volumes when a dumpfile
+appears in several places (e.g., from an amvault operation).  At the moment,
+the algorithm argument should be omitted, as this will eventually indicate that
+the user-configured algorithm should be applied.
+
+=head2 INSTANTIATING A PLAN
+
+For most purposes, you should call C<make_plan> with the desired dumpspecs, a
+changer, and a callback:
+
+    Amanda::Recovery::Planner::make_plan(
+       dumpspecs => [ $ds1, $ds2, .. ],
+       changer => $chg,
+       plan_cb => $plan_cb);
+
+As a shortcut, you may also specify a single dumpspec:
+
+    Amanda::Recovery::Planner::make_plan(
+       dumpspec => $ds,
+       changer => $chg,
+       plan_cb => $plan_cb);
+
+Note that in this case, the resulting plan may contain more than one dump, if
+the dumpspec was not unambiguous.
+
+To select the planner algorithm, pass an C<algorithm> argument.  This argument
+is currently ignored and should be omitted.  If the optional argument C<debug>
+is given with a true value, then the Planner will log additional debug
+information to the Amanda debug logs.  Debugging is automatically enabled if
+the C<DEBUG_RECOVERY> configuration parameter is set to anything greater than
+1.
+
+The optional argument C<one_dump_per_part> will create a "no-reassembly" plan,
+where each part appears as the only part in a unique dump.  The dump objects
+will have the key C<single_part> set to 1.
+
+The C<plan_cb> is called with two arguments:
+
+    $plan_cb->($err, $plan);
+
+If C<$err> is defined, it describes an error that occurred; otherwise, C<$plan>
+is the generated plan, as described below.
+
+Some algorithms may consult the changer's inventory to determine what volumes
+are available.  It is because of this asynchronous operation that C<make_plan>
+takes a callback instead of simply returning the plan.
+
+=head3 Pre-defined Plans
+
+In some cases, you already know exactly where the data is, and just need a
+proper plan object to hand to L<Amanda::Recovery::Clerk>.  One such case is a
+recovery from a holding file.  In this case, use C<make_plan> like this:
+
+    Amanda::Recovery::Planner::make_plan(
+       holding_file => $hf,
+       dumpspec => $ds,
+       plan_cb => $plan_cb);
+
+This will create a plan to recover the data in C<$hf>.  The dumpspec is
+optional, but if present will be used to verify that the holding file contains
+the appropriate dump.
+
+Similarly, if you have a list of label:fileno pairs to use, call C<make_plan>
+like this:
+
+    Amanda::Recovery::Planner::make_plan(
+       filelist => [
+           $label => [ $filenum, $filenum, .. ],
+           $label => ..
+       ],
+       dumpspec => $ds,
+       plan_cb => $plan_cb);
+
+This will verify the requested files against the catalog and the dumpspec, then
+hand back a plan that essentially embodies C<filelist>.
+
+Note that both of these functions will only create a single-dump plan.
+
+=head2 PLANS
+
+A Plan is a perl object describing the process for recovering zero or more
+dumpfiles.  Its principal components are dumps, in order, that are to be
+recovered, but the object presents some other interfaces that return useful
+information about the plan.
+
+The C<'dumps'> key holds the list of dumps, in the order they should be
+performed.  Callers should shift dumps off this list to present to the Clerk.
+
+To get a list of volumes that the plan requires, in order, use
+C<get_volume_list>.  Each volume is represented as a hash:
+
+  { label => 'DATA182', available => 1 }
+
+where C<available> is false if the planner did not find this volume in the
+changer.  Planners which do not consult the changer will have a false value for
+C<available>.
+
+Similarly, to get a list of holding files that the plan requires, in order, use
+C<get_holding_file_list>.  Each file is represented as a string giving the
+fully qualified pathname.
+
+=cut
+
+package Amanda::Recovery::Planner;
+
+use strict;
+use warnings;
+use Carp;
+
+# Package-level entry point: construct a Plan object from the caller's named
+# parameters and hand off to the Plan method that matches the request.
+#
+# Keys examined here:
+#   dumpspec          - if present, wrapped into a one-element 'dumpspecs'
+#   holding_file      - if present, make_holding_plan is used
+#   filelist          - otherwise, if present, make_plan_from_filelist is used
+#   algorithm, changer, debug, one_dump_per_part
+#                     - stored on the new Plan object
+#
+# The full parameter list is forwarded to the chosen method, and that
+# method's return value is returned.
+sub make_plan {
+    my (%args) = @_;
+
+    if (exists $args{'dumpspec'}) {
+        $args{'dumpspecs'} = [ $args{'dumpspec'} ];
+    }
+
+    my %plan_attrs = (
+        algo => $args{'algorithm'},
+        chg => $args{'changer'},
+        debug => $args{'debug'},
+        one_dump_per_part => $args{'one_dump_per_part'},
+    );
+    my $plan = Amanda::Recovery::Planner::Plan->new(\%plan_attrs);
+
+    # holding_file wins over filelist; a catalog search is the fallback
+    return $plan->make_holding_plan(%args)
+        if exists $args{'holding_file'};
+    return $plan->make_plan_from_filelist(%args)
+        if exists $args{'filelist'};
+    return $plan->make_plan(%args);
+}
+
+package Amanda::Recovery::Planner::Plan;
+
+use strict;
+use warnings;
+use Data::Dumper;
+use Carp;
+
+use Amanda::Device qw( :constants );
+use Amanda::Holding;
+use Amanda::Header;
+use Amanda::Config qw( :getconf config_dir_relative );
+use Amanda::Debug qw( :logging );
+use Amanda::MainLoop;
+use Amanda::DB::Catalog;
+use Amanda::Tapelist;
+
+# Constructor.  Takes the class name and a hashref of attributes, and blesses
+# that same hashref (no copy is made).
+#
+# The 'debug' attribute is raised to $Amanda::Config::debug_recovery when it
+# is undefined or lower than that configured value.
+sub new {
+    my ($class, $self) = @_;
+
+    if (!defined $self->{'debug'}
+        or $Amanda::Config::debug_recovery > $self->{'debug'}) {
+        $self->{'debug'} = $Amanda::Config::debug_recovery;
+    }
+
+    return bless($self, $class);
+}
+
+# Remove the first dump from this plan's 'dumps' list and return it; returns
+# undef once the list is empty.
+sub shift_dump {
+    my ($self) = @_;
+
+    my $next_dump = shift @{ $self->{'dumps'} };
+    return $next_dump;
+}
+
+# Build a plan by searching the catalog for dumps matching the dumpspecs.
+#
+# Named parameters:
+#   changer   - required (only its presence is checked in this method)
+#   plan_cb   - required; scheduled through Amanda::MainLoop::call_later
+#               with the arguments (undef, $self) once the plan is built
+#   dumpspecs - required; passed to Amanda::DB::Catalog::get_dumps
+#
+# Croaks if a required parameter is absent.  The selected dumps are stored,
+# in recovery order, under $self->{'dumps'}.
+sub make_plan {
+    my $self = shift;
+    my %params = @_;
+
+    for my $rq_param (qw(changer plan_cb dumpspecs)) {
+        croak "required parameter '$rq_param' missing"
+            unless exists $params{$rq_param};
+    }
+    my $dumpspecs = $params{'dumpspecs'};
+
+    # first, get the set of dumps that match these dumpspecs
+    my @dumps = Amanda::DB::Catalog::get_dumps(dumpspecs => $dumpspecs);
+
+    # now "bin" those by host/disk/dump_ts/level
+    my %dumps;
+    for my $dump (@dumps) {
+        my $k = join("\0", $dump->{'hostname'}, $dump->{'diskname'},
+                           $dump->{'dump_timestamp'}, $dump->{'level'});
+        push @{$dumps{$k}}, $dump;
+    }
+
+    # now select the "best" of each set of dumps, and put that in @dumps
+    @dumps = ();
+    for my $options (values %dumps) {
+        my @options = @$options;
+        # if there's only one option, the choice is easy
+        if (@options == 1) {
+            push @dumps, $options[0];
+            next;
+        }
+
+        # if there are several, narrow to those with an OK status or barring that,
+        # those with a PARTIAL status.  FAIL need not apply.
+        my @ok_options = grep { $_->{'status'} eq 'OK' } @options;
+        my @partial_options = grep { $_->{'status'} eq 'PARTIAL' } @options;
+
+        if (@ok_options) {
+            @options = @ok_options;
+        } else {
+            @options = @partial_options;
+        }
+
+        # if every copy was rejected there is nothing usable; skip the set
+        # rather than pushing an undef entry into the plan
+        if (!@options) {
+            $self->dbg("no OK or PARTIAL copy of "
+                . "$options->[0]{'hostname'}:$options->[0]{'diskname'} "
+                . "at $options->[0]{'dump_timestamp'}; skipping");
+            next;
+        }
+
+        # now, take the one written longest ago - this gets us the dump on secondary
+        # media if it hasn't been overwritten, otherwise the dump on tertiary media,
+        # etc.  Note that this also prefers dumps on holding disk, since they are
+        # tagged with a write_timestamp of 0
+        @options = Amanda::DB::Catalog::sort_dumps(['write_timestamp'], @options);
+        push @dumps, $options[0];
+    }
+
+    # at this point we have at most one instance of each dump in @dumps.
+
+    # If one_dump_per_part was specified, rearrange @dumps to have a distinct
+    # dump object for each part.
+    if ($self->{'one_dump_per_part'}) {
+        @dumps = $self->split_dumps_per_part(\@dumps);
+    }
+
+    # now sort the dumps in order by their constituent parts.  This sorts based
+    # on write_timestamp, then on the label of the first part of the dump,
+    # using the tapelist to order the labels.  Where labels match, it sorts on
+    # the part's filenum.  This should sort the dumps into the order in which
+    # they were written, with holding dumps coming in at the head of the list.
+    my $tapelist_filename = config_dir_relative(getconf($CNF_TAPELIST));
+    my $tapelist = Amanda::Tapelist->new($tapelist_filename);
+
+    my $sortfn = sub {
+        my $rv;
+        my $tle;
+
+        return $rv
+            if ($rv = $a->{'write_timestamp'} cmp $b->{'write_timestamp'});
+
+        # above will take care of comparing a holding dump to an on-media dump, but
+        # if both are on holding then we need to compare them lexically
+        if (exists $a->{'parts'}[1]{'holding_file'}
+        and exists $b->{'parts'}[1]{'holding_file'}) {
+            return $a->{'parts'}[1]{'holding_file'} cmp $b->{'parts'}[1]{'holding_file'};
+        }
+
+        my ($alabel, $blabel) = (
+            $a->{'parts'}[1]{'label'},
+            $b->{'parts'}[1]{'label'},
+        );
+
+        my ($apos, $bpos);
+        $apos = $tle->{'position'}
+            if (($tle = $tapelist->lookup_tapelabel($alabel)));
+        $bpos = $tle->{'position'}
+            if (($tle = $tapelist->lookup_tapelabel($blabel)));
+        return ($bpos <=> $apos) # note: reversed for "oldest to newest"
+            if defined $bpos && defined $apos && ($bpos <=> $apos);
+
+        # if a tape wasn't in the tapelist, just sort the labels lexically (this
+        # really shouldn't happen)
+        if (!defined $bpos || !defined $apos) {
+            return $alabel cmp $blabel
+                if defined $alabel and defined $blabel and $alabel cmp $blabel ;
+        }
+
+        # finally, the dumps are on the same volume, so just sort by filenum
+        return $a->{'parts'}[1]{'filenum'} <=> $b->{'parts'}[1]{'filenum'};
+    };
+    @dumps = sort $sortfn @dumps;
+
+    $self->{'dumps'} = \@dumps;
+
+    Amanda::MainLoop::call_later($params{'plan_cb'}, undef, $self);
+}
+
+# Build a single-dump plan for recovering from one holding file.
+#
+# Named parameters:
+#   holding_file - required; path handed to Amanda::Holding::get_header
+#   plan_cb      - required; receives the result
+#   dumpspec     - optional; when true it is added to the catalog query
+#
+# Croaks if a required parameter is absent.  On failure plan_cb is called
+# directly with a single error string and its return value is returned.  On
+# success plan_cb is scheduled through Amanda::MainLoop::call_later with the
+# arguments (undef, $self).
+sub make_holding_plan {
+    my $self = shift;
+    my %params = @_;
+
+    for my $rq_param (qw(holding_file plan_cb)) {
+        croak "required parameter '$rq_param' missing"
+            unless exists $params{$rq_param};
+    }
+
+    # This is a little tricky.  The idea is to open up the holding file and
+    # read its header, then find that dump in the catalog.  This may seem like
+    # the long way around, but it adds an extra layer of security to the
+    # recovery process, as it prevents recovery from arbitrary files on the
+    # filesystem that are not under a recognized holding directory.
+
+    my $hdr = Amanda::Holding::get_header($params{'holding_file'});
+    if (!$hdr or $hdr->{'type'} != $Amanda::Header::F_DUMPFILE) {
+        return $params{'plan_cb'}->(
+                "could not open '$params{holding_file}': missing or not a holding file");
+    }
+
+    # look up this holding file in the catalog, adding the dumpspec we were
+    # given so that get_dumps will compare against it for us.
+    my $dump_timestamp = $hdr->{'datestamp'};
+    my $hostname = $hdr->{'name'};
+    my $diskname = $hdr->{'disk'};
+    my $level = $hdr->{'dumplevel'};
+    my @dumps = Amanda::DB::Catalog::get_dumps(
+            $params{'dumpspec'}? (dumpspecs => [ $params{'dumpspec'} ]) : (),
+            dump_timestamp => $dump_timestamp,
+            hostname => $hostname,
+            diskname => $diskname,
+            level => $level,
+            holding => 1,
+        );
+
+    if (!@dumps) {
+        return $params{'plan_cb'}->(
+                "Specified holding file does not match dumpspec");
+    }
+
+    # this would be weird..
+    $self->dbg("got multiple dumps from Amanda::DB::Catalog for a holding file!")
+        if (@dumps > 1);
+
+    # arbitrarily keep the first dump if we got several
+    $self->{'dumps'} = [ $dumps[0] ];
+
+    Amanda::MainLoop::call_later($params{'plan_cb'}, undef, $self);
+}
+
+# Build a single-dump plan from an explicit list of label/filenum pairs.
+#
+# Named parameters:
+#   filelist  - required; arrayref of alternating label, arrayref-of-filenums
+#   plan_cb   - required; receives the result
+#   dumpspec  - optional; when true it restricts the catalog part search
+#   dumpspecs - optional; arrayref used to restrict the final dump re-query
+#               (if absent or empty, a true 'dumpspec' is used instead)
+#
+# Croaks if a required parameter is absent.  If the file list matches no dump
+# or more than one dump, plan_cb is called directly with a single error
+# string and its return value is returned.  On success plan_cb is scheduled
+# through Amanda::MainLoop::call_later with the arguments (undef, $self).
+sub make_plan_from_filelist {
+    my $self = shift;
+    my %params = @_;
+
+    for my $rq_param (qw(filelist plan_cb)) {
+        croak "required parameter '$rq_param' missing"
+            unless exists $params{$rq_param};
+    }
+
+    # This is similarly tricky - in this case, we search for dumps matching
+    # both the dumpspec and the labels, filter that down to just the parts we
+    # want, and then check that only one dump remains.  Then we look up that
+    # dump.
+
+    my @labels;
+    my %files;
+    my @filelist = @{$params{'filelist'}};
+    while (@filelist) {
+        my $label = shift @filelist;
+        push @labels, $label;
+        $files{$label} = shift @filelist;
+    }
+
+    my @parts = Amanda::DB::Catalog::get_parts(
+            $params{'dumpspec'}? (dumpspecs => [ $params{'dumpspec'} ]) : (),
+            labels => [ @labels ]);
+
+    # filter down to the parts that match filelist (using %files)
+    @parts = grep {
+        my $part = $_;
+        grep { $_ == $part->{'filenum'} } @{$files{$part->{'label'}}};
+    } @parts;
+
+    # extract the dumps, using a hash (on the perl identity of the dump) to
+    # ensure uniqueness
+    my %dumps = map { my $d = $_->{'dump'}; ($d, $d) } @parts;
+    my @dumps = values %dumps;
+
+    if (!@dumps) {
+        return $params{'plan_cb'}->(
+                "Specified file list does not match dumpspec");
+    } elsif (@dumps > 1) {
+        return $params{'plan_cb'}->(
+                "Specified file list matches multiple dumps; cannot continue recovery");
+    }
+
+    # Work out which dumpspecs to apply to the re-query below.  Undefined
+    # entries are dropped, and the key is omitted entirely when nothing is
+    # left, so that an undef value is never handed to the catalog.
+    my @dumpspecs = grep { defined } @{ $params{'dumpspecs'} || [] };
+    @dumpspecs = ($params{'dumpspec'})
+        if !@dumpspecs and $params{'dumpspec'};
+
+    # now, because of the weak linking used by Amanda::DB::Catalog, we need to
+    # re-query for this dump.  If we don't do this, the parts will all be
+    # garbage-collected when we hand back the plan.  This is, charitably, "less than
+    # ideal".  Note that this has the side-effect of filling in any parts of the
+    # dump that were missing from the filelist.
+    @dumps = Amanda::DB::Catalog::get_dumps(
+        hostname => $dumps[0]->{'hostname'},
+        diskname => $dumps[0]->{'diskname'},
+        level => $dumps[0]->{'level'},
+        dump_timestamp => $dumps[0]->{'dump_timestamp'},
+        write_timestamp => $dumps[0]->{'write_timestamp'},
+        @dumpspecs? (dumpspecs => [ @dumpspecs ]) : ());
+
+    # sanity check
+    die "dump located via file list was not found when re-querying the catalog"
+        unless @dumps;
+    $self->{'dumps'} = [ $dumps[0] ];
+
+    Amanda::MainLoop::call_later($params{'plan_cb'}, undef, $self);
+}
+
+# Given an arrayref of dumps, return a list containing one new dump per
+# defined part.  Each new dump is a shallow copy of the original whose
+# 'parts' list is [ undef, <copy of that one part> ], whose part copy points
+# back at the new dump, and which has 'single_part' set to 1.  The input
+# dumps and parts are not modified.
+sub split_dumps_per_part {
+    my ($self, $dumps) = @_;
+
+    my @split;
+
+    for my $orig_dump (@$dumps) {
+        for my $orig_part (@{$orig_dump->{'parts'}}) {
+            # the slot at index 0 is undef; nothing to copy there
+            next if !defined $orig_part;
+
+            # shallow copies, so the originals stay untouched
+            my %dump_copy = %$orig_dump;
+            my %part_copy = %$orig_part;
+
+            # re-link the copies to each other
+            $part_copy{'dump'} = \%dump_copy;
+            $dump_copy{'parts'} = [ undef, \%part_copy ];
+            $dump_copy{'single_part'} = 1;
+
+            push @split, \%dump_copy;
+        }
+    }
+
+    return @split;
+}
+
+# Return the volumes this plan needs, in plan order, as a list of hashes of
+# the form { label => $label, available => 0 }.  Consecutive parts on the
+# same label produce a single entry; a label that recurs later in the plan is
+# listed again.  'available' is always 0 here.
+sub get_volume_list {
+    my ($self) = @_;
+    my @volumes;
+    my $previous;
+
+    for my $dump (@{$self->{'dumps'}}) {
+        for my $part (@{$dump->{'parts'}}) {
+            # parts[0] is undef, and holding parts have no label
+            next if !defined $part or !defined $part->{'label'};
+
+            my $label = $part->{'label'};
+            next if defined $previous and $previous eq $label;
+
+            push @volumes, { label => $label, available => 0 };
+            $previous = $label;
+        }
+    }
+
+    return @volumes;
+}
+
+# Return the 'holding_file' value of every part in the plan that has one, in
+# plan order.  Parts without a holding_file (on-media parts) and the undef
+# slot at parts[0] are skipped.
+sub get_holding_file_list {
+    my ($self) = @_;
+    my @paths;
+
+    for my $dump (@{$self->{'dumps'}}) {
+        for my $part (@{$dump->{'parts'}}) {
+            if (defined $part and defined $part->{'holding_file'}) {
+                push @paths, $part->{'holding_file'};
+            }
+        }
+    }
+
+    return @paths;
+}
+
+# Log $msg through debug(), prefixed with the package name, but only when
+# this plan's 'debug' attribute is true.
+sub dbg {
+    my $self = shift;
+    my $msg = shift;
+
+    debug("Amanda::Recovery::Planner: $msg") if $self->{'debug'};
+}
+
+1;