git.gag.com Git - debian/amanda/blob - perl/Amanda/DB/Catalog.pm

   1 # Copyright (c) 2006 Zmanda Inc.  All Rights Reserved.
   2 #
   3 # This program is free software; you can redistribute it and/or modify it
   4 # under the terms of the GNU General Public License version 2 as published
   5 # by the Free Software Foundation.
   6 #
   7 # This program is distributed in the hope that it will be useful, but
   8 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   9 # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  10 # for more details.
  11 #
  12 # You should have received a copy of the GNU General Public License along
  13 # with this program; if not, write to the Free Software Foundation, Inc.,
  14 # 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  15 #
  16 # Contact information: Zmanda Inc, 505 N Mathlida Ave, Suite 120
  17 # Sunnyvale, CA 94085, USA, or: http://www.zmanda.com
  18
  19 package Amanda::DB::Catalog;
  20
  21 =head1 NAME
  22
  23 Amanda::DB::Catalog - access to the Amanda catalog: where is that dump?
  24
  25 =head1 SYNOPSIS
  26
  27   use Amanda::DB::Catalog;
  28
  29   # get all write timestamps on record
  30   my @timestamps = Amanda::DB::Catalog::get_write_timestamps();
  31
  32   # loop over those timestamps, printing dump info for each one
  33   for my $timestamp (@timestamps) {
  34       my @dumpfiles = Amanda::DB::Catalog::get_dumps(
  35           write_timestamp => $timestamp,
  36           status => 'OK'
  37       );
  38       print "$timestamp:\n";
  39       for my $dumpfile (@dumpfiles) {
  40           print " ", $dumpfile->{hostname}, ":", $dumpfile->{diskname},
  41                 " level ", $dumpfile->{level}, "\n";
  42       }
  43   }
  44
  45 =head1 DESCRIPTION
  46
  47 =head2 MODEL
  48
  49 The Amanda catalog is a set of dumpfiles, where each dumpfile corresponds to a
  50 single file in a storage volume.  On tapes, files are separated by filemarks
  51 and numbered sequentially.  This model is preserved on non-tape media such as
  52 the VFS and S3 devices.  A dumpfile, then, is completely specified by a volume
  53 label and a file number (I<filenum>).
  54
  55 The catalog is presented as a single table containing one row per dumpfile.
  56 Each row has the following values:
  57
  58 =over
  59
  60 =item label
  61
  62 (string) -- volume label
  63
  64 =item filenum
  65
  66 (integer) -- file on that volume
  67
  68 =item dump_timestamp
  69
  70 (string) -- timestamp of the run in which the dump was created
  71
  72 =item write_timestamp
  73
  74 (string) -- timestamp of the run in which the dump was written to this volume
  75
  76 =item hostname
  77
  78 (string) -- dump hostname
  79
  80 =item diskname
  81
  82 (string) -- dump diskname
  83
  84 =item level
  85
  86 (integer) -- dump level
  87
  88 =item status
  89
  90 (string) -- "OK", "PARTIAL" or some other descriptor
  91
  92 =item partnum
  93
  94 (integer) -- part number of a split dump (1-based)
  95
  96 =item nparts
  97
  98 (integer) -- number of parts in this dump (estimated)
  99
 100 =item kb
 101
 102 (integer) -- size (in kb) of this dumpfile
 103
 104 =item sec
 105
 106 (integer) -- time (in seconds) spent writing this dumpfile
 107
 108 =back
 109
 110 A dumpfile is represented as a hashref with these keys.
 111
 112 The label and filenum serve as a primary key.  The dump_timestamp, hostname,
 113 diskname, and level uniquely identify the dump.  The write_timestamp gives the
 114 time that the dump was written to this volume.  The write_timestamp may differ
 115 from the dump_timestamp if, for example, I<amflush> wrote the dump to tape
 116 after the initial dump.  The remaining fields are informational.
 117
 118 =head2 NOTES
 119
 120 A dumpfile may be a part of a larger (split) dump, or may be partial (due to
 121 end of tape or some other error), so the contents of the catalog require some
 122 interpretation in order to find a particular dump.
 123
 124 All timestamps used in this module are full-length, in the format
 125 C<YYYYMMDDHHMMSS>.  If the underlying data contains only datestamps, they are
 126 zero-extended into timestamps: C<YYYYMMDD000000>.  A dump_timestamp always
 127 corresponds to the initiation of the I<original> dump run, while
 128 write_timestamp gives the time the file was written to the volume.  When
 129 dumpfiles are migrated from volume to volume (e.g., by I<amflush>), the
 130 dump_timestamp does not change.
 131
 132 In Amanda, the tuple (hostname, diskname, level, dump_timestamp) serves as a unique
 133 identifier for a dump.  Since all of this information is preserved during
 134 migrations, a catalog query with these four terms will return all dumpfiles
 135 relevant to that dump.
 136
 137 =head2 QUERIES
 138
 139 This API is mostly read-only: only C<add_dump> modifies the catalog.  The following functions are available:
 140
 141 =over
 142
 143 =item get_write_timestamps()
 144
 145 Get a list of all write timestamps, sorted in chronological order.
 146
 147 =item get_latest_write_timestamp()
 148
 149 Return the most recent write timestamp.
 150
 151 =item get_labels_written_at_timestamp($ts)
 152
 153 Return a list of labels for volumes written at the given timestamp.
 154
 155 =item get_dumps(%parameters)
 156
 157 This function is the workhorse query interface, and returns a sequence of
 158 dumpfiles.  Values in C<%parameters> restrict the set of dumpfiles that are
 159 returned.  The hash can have any of the following keys:
 160
 161 =over
 162
 163 =item write_timestamp
 164
 165 restrict to dumpfiles written at this timestamp
 166
 167 =item write_timestamps
 168
 169 (arrayref) restrict to dumpfiles written at any of these timestamps
 170
 171 =item dump_timestamp
 172
 173 restrict to dumpfiles with exactly this timestamp
 174
 175 =item dump_timestamps
 176
 177 (arrayref) restrict to dumpfiles with any of these timestamps
 178
 179 =item dump_timestamp_match
 180
 181 restrict to dumpfiles with timestamps matching this expression
 182
 183 =item hostname
 184
 185 restrict to dumpfiles with exactly this hostname
 186
 187 =item hostnames
 188
 189 (arrayref) restrict to dumpfiles with any of these hostnames
 190
 191 =item hostname_match
 192
 193 restrict to dumpfiles with hostnames matching this expression
 194
 195 =item diskname
 196
 197 restrict to dumpfiles with exactly this diskname
 198
 199 =item disknames
 200
 201 (arrayref) restrict to dumpfiles with any of these disknames
 202
 203 =item diskname_match
 204
 205 restrict to dumpfiles with disknames matching this expression
 206
 207 =item label
 208
 209 restrict to dumpfiles with exactly this label
 210
 211 =item labels
 212
 213 (arrayref) restrict to dumpfiles with any of these labels
 214
 215 =item level
 216
 217 restrict to dumpfiles with exactly this level
 218
 219 =item levels
 220
 221 (arrayref) restrict to dumpfiles with any of these levels
 222
 223 =item status
 224
 225 restrict to dumpfiles with this status
 226
 227 =back
 228
 229 Match expressions are described in the amanda(8) manual page.
 230
 231 =item sort_dumps([ $key1, $key2, .. ], @dumps)
 232
 233 Given a list of dumps, this function sorts that list by the requested keys.
 234 The following keys are available:
 235
 236 =over
 237
 238 =item hostname
 239
 240 =item diskname
 241
 242 =item write_timestamp
 243
 244 =item dump_timestamp
 245
 246 =item level
 247
 248 =item filenum
 249
 250 =item label
 251
 252 =item partnum
 253
 254 =item kb
 255
 256 =item sec
 257
 258 =back
 259
 260 Keys are processed from left to right: if two dumps have the same value for
 261 C<$key1>, then C<$key2> is examined, and so on.  Key names may be prefixed by
 262 "C<->" to reverse the order.
 263
 264 =item add_dump($dumpfile)
 265
 266 Add the given dumpfile to the database.  In terms of logfiles, this will either
 267 create a new logfile (if the dump's C<write_timestamp> has not been seen
 268 before) or append to an existing logfile.  Note that a new logfile will require
 269 a corresponding new entry in the tapelist.
 270
 271 Note that no locking is performed: multiple simultaneous calls to this function
 272 can result in a corrupted or incorrect logfile.
 273
 274 =back
 275
 276 =head1 API STATUS
 277
 278 New summary functions may be added to reduce code duplication in other parts of
 279 Amanda.
 280
 281 Support for loading and modifying the tapelist may eventually be folded into
 282 this module.
 283
 284 =cut
 285
 286 use Amanda::Logfile;
 287 use Amanda::Tapelist;
 288 use Amanda::Config qw( :init :getconf config_dir_relative );
 289 use Amanda::Util qw( quote_string );
 290 use warnings;
 291 use strict;
 292
 293 # tapelist cache
 294 my $tapelist = undef;
 295 my $tapelist_filename = undef;
 296
 297 # utility function
 298 sub zeropad {
 299     my ($timestamp) = @_;
 300     if (length($timestamp) == 8) {
 301         return $timestamp."000000";
 302     }
 303     return $timestamp;
 304 }
 305
 306 sub get_write_timestamps {
 307     my @rv;
 308
 309     # find_log assumes that the tapelist has been loaded, so load it now
 310     _load_tapelist();
 311
 312     for (Amanda::Logfile::find_log()) {
 313         next unless (my ($timestamp) = /^log\.([0-9]+)(?:\.[0-9]+|\.amflush)?$/);
 314         push @rv, zeropad($timestamp);
 315     }
 316
 317     return sort @rv;
 318 }
 319
 320 sub get_latest_write_timestamp {
 321     # get all of the timestamps and select the last one
 322     my @timestamps = get_write_timestamps();
 323
 324     if (@timestamps) {
 325         return $timestamps[-1];
 326     }
 327
 328     return undef;
 329 }
 330
 331 sub get_dumps {
 332     my %params = @_;
 333     my $logfile_dir = config_dir_relative(getconf($CNF_LOGDIR));
 334
 335     # find_log assumes that the tapelist has been loaded, so load it now
 336     _load_tapelist();
 337
 338     # pre-process params by appending all of the "singular" parameters to the "plurals"
 339     push @{$params{'write_timestamps'}}, map { zeropad($_) } $params{'write_timestamp'}
 340         if exists($params{'write_timestamp'});
 341     push @{$params{'dump_timestamps'}}, map { zeropad($_) } $params{'dump_timestamp'}
 342         if exists($params{'dump_timestamp'});
 343     push @{$params{'hostnames'}}, $params{'hostname'}
 344         if exists($params{'hostname'});
 345     push @{$params{'disknames'}}, $params{'diskname'}
 346         if exists($params{'diskname'});
 347     push @{$params{'levels'}}, $params{'level'}
 348         if exists($params{'level'});
 349     push @{$params{'labels'}}, $params{'label'}
 350         if exists($params{'label'});
 351
 352     # Since we're working from logfiles, we have to pick the logfiles we'll use first.
 353     # Then we can use search_logfile.
 354     my @logfiles;
 355     if (exists($params{'write_timestamps'})) {
 356         # if we have specific write_timestamps, the job is pretty easy.
 357         my %timestamps_hash = map { ($_, undef) } @{$params{'write_timestamps'}};
 358         for my $logfile (Amanda::Logfile::find_log()) {
 359             next unless (my ($timestamp) = $logfile =~ /^log\.([0-9]+)(?:\.[0-9]+|\.amflush)?$/);
 360             next unless (exists($timestamps_hash{zeropad($timestamp)}));
 361             push @logfiles, $logfile;
 362         }
 363     } elsif (exists($params{'dump_timestamps'})) {
 364         # otherwise, we need only look in logfiles at or after the earliest dump timestamp
 365         my @sorted_timestamps = sort @{$params{'dump_timestamps'}};
 366         my $earliest_timestamp = $sorted_timestamps[0];
 367         for my $logfile (Amanda::Logfile::find_log()) {
 368             next unless (my ($timestamp) = $logfile =~ /^log\.([0-9]+)(?:\.[0-9]+|\.amflush)?$/);
 369             next unless (zeropad($timestamp) ge $earliest_timestamp);
 370             push @logfiles, $logfile;
 371         }
 372     } else {
 373         # oh well -- it looks like we'll have to read all existing logfiles.
 374         @logfiles = Amanda::Logfile::find_log();
 375     }
 376
 377     # Set up some hash tables for speedy lookups of various attributes
 378     my (%dump_timestamps_hash, %hostnames_hash, %disknames_hash, %levels_hash, %labels_hash);
 379     %dump_timestamps_hash = map { ($_, undef) } @{$params{'dump_timestamps'}}
 380         if (exists($params{'dump_timestamps'}));
 381     %hostnames_hash = map { ($_, undef) } @{$params{'hostnames'}}
 382         if (exists($params{'hostnames'}));
 383     %disknames_hash = map { ($_, undef) } @{$params{'disknames'}}
 384         if (exists($params{'disknames'}));
 385     %levels_hash = map { ($_, undef) } @{$params{'levels'}}
 386         if (exists($params{'levels'}));
 387     %labels_hash = map { ($_, undef) } @{$params{'labels'}}
 388         if (exists($params{'labels'}));
 389
 390     # now loop over those logfiles and use search_logfile to load the dumpfiles
 391     # from them, then process each entry from the logfile
 392     my @results;
 393     for my $logfile (@logfiles) {
 394         # get the raw contents from search_logfile
 395         my @find_results = Amanda::Logfile::search_logfile(undef, undef,
 396                                                     "$logfile_dir/$logfile", 1);
 397
 398         # filter against *_match with dumps_match
 399         @find_results = Amanda::Logfile::dumps_match([@find_results],
 400             exists($params{'hostname_match'})? $params{'hostname_match'} : undef,
 401             exists($params{'diskname_match'})? $params{'diskname_match'} : undef,
 402             exists($params{'dump_timestamp_match'})? $params{'dump_timestamp_match'} : undef,
 403             undef,
 404             0);
 405
 406         # convert to dumpfile hashes, including the write_timestamp from the logfile name
 407         my ($timestamp) = $logfile =~ /^log\.([0-9]+)(?:\.[0-9]+|\.amflush)?$/;
 408         my $write_timestamp = zeropad($timestamp);
 409
 410         # loop over each entry in the logfile.
 411         for my $find_result (@find_results) {
 412
 413             # filter out the non-dump error messages that find.c produces
 414             next unless (defined $find_result->{'label'});
 415
 416             # bail out on this result early, if possible
 417             next if (%dump_timestamps_hash
 418                 and !exists($dump_timestamps_hash{zeropad($find_result->{'timestamp'})}));
 419             next if (%hostnames_hash
 420                 and !exists($hostnames_hash{$find_result->{'hostname'}}));
 421             next if (%disknames_hash
 422                 and !exists($disknames_hash{$find_result->{'diskname'}}));
 423             next if (%levels_hash
 424                 and !exists($levels_hash{$find_result->{'level'}}));
 425             next if (%labels_hash
 426                 and !exists($labels_hash{$find_result->{'label'}}));
 427             next if (exists($params{'status'})
 428                 and $find_result->{'status'} ne $params{'status'});
 429
 430             # start setting up a dumpfile hash for this result
 431             my %dumpfile = (
 432                 'write_timestamp' => $write_timestamp,
 433                 'dump_timestamp' => zeropad($find_result->{'timestamp'}),
 434                 'hostname' => $find_result->{'hostname'},
 435                 'diskname' => $find_result->{'diskname'},
 436                 'level' => $find_result->{'level'},
 437                 'label' => $find_result->{'label'},
 438                 'filenum' => $find_result->{'filenum'},
 439                 'status' => $find_result->{'status'},
 440                 'sec' => $find_result->{'sec'},
 441                 'kb' => $find_result->{'kb'},
 442             );
 443
 444             # partnum and nparts takes some special interpretation
 445             if (my ($partnum, $nparts) = $find_result->{'partnum'} =~ m$(\d+)/(-?\d+)$) {
 446                 $dumpfile{'partnum'} = $partnum+0;
 447                 $dumpfile{'nparts'} = $nparts+0;
 448             } else {
 449                 $dumpfile{'partnum'} = 1;
 450                 $dumpfile{'nparts'} = 1;
 451             }
 452
 453             # check partnum and nparts
 454             next if (defined($params{'partnum'}) and $dumpfile{'partnum'} != $params{'partnum'});
 455             next if (defined($params{'nparts'}) and $dumpfile{'nparts'} != $params{'nparts'});
 456
 457             push @results, \%dumpfile;
 458         }
 459     }
 460
 461     return @results;
 462 }
 463
 464 sub sort_dumps {
 465     my ($keys, @dumps) = @_;
 466
 467     return sort {
 468         my $r;
 469         for my $key (@$keys) {
 470             if ($key =~ /^-(.*)$/) {
 471                 $r = $b->{$1} cmp $a->{$1}; # note: $a and $b are reversed
 472             } else {
 473                 $r = $a->{$key} cmp $b->{$key};
 474             }
 475             return $r if $r;
 476         }
 477         return 0;
 478     } @dumps;
 479 }
 480
 481 # caches for add_dump() to avoid repeatedly looking up the log
 482 # filename for a particular write_timestamp.
 483 my $add_dump_last_label = undef;
 484 my $add_dump_last_write_timestamp = undef;
 485 my $add_dump_last_logfile = undef;
 486
 487 sub add_dump {
 488     my ($dump) = @_;
 489     my $found;
 490     my $logfh;
 491     my $logfile;
 492     my $find_result;
 493     my $logdir = getconf($CNF_LOGDIR);
 494     my ($last_filenum, $last_secs, $last_kbs);
 495
 496     # first order of business is to find out whether we need to make a new
 497     # dumpfile for this.
 498     my $write_timestamp = zeropad($dump->{'write_timestamp'});
 499     die "dump has no 'write_timestamp'" unless defined $write_timestamp;
 500
 501     # consult our one-element cache for this label and write_timestamp
 502     if (!defined $add_dump_last_label
 503         or $add_dump_last_label ne $dump->{'label'}
 504         or $add_dump_last_write_timestamp ne $dump->{'write_timestamp'}) {
 505
 506         # update the cache
 507         $add_dump_last_logfile = undef;
 508         LOGFILE:
 509         for my $lf (Amanda::Logfile::find_log()) {
 510             next unless (my ($log_timestamp) = $lf =~ /^log\.([0-9]+)(?:\.[0-9]+|\.amflush)?$/);
 511             next unless (zeropad($log_timestamp) eq $write_timestamp);
 512
 513             # write timestamp matches; now check the label
 514             LOGFILE_DUMP:
 515             for $find_result (Amanda::Logfile::search_logfile(undef, undef,
 516                                         "$logdir/$lf", 1)) {
 517                 next unless (defined $find_result->{'label'});
 518
 519                 if ($find_result->{'label'} eq $dump->{'label'}) {
 520                     $add_dump_last_label = $dump->{'label'};
 521                     $add_dump_last_write_timestamp = $dump->{'write_timestamp'};
 522                     $add_dump_last_logfile = $lf;
 523                     last LOGFILE;
 524                 }
 525             }
 526         }
 527     }
 528     $logfile = $add_dump_last_logfile;
 529
 530     # truncate the write_timestamp if we're not using timestamps
 531     if (!getconf($CNF_USETIMESTAMPS)) {
 532         $write_timestamp = substr($write_timestamp, 0, 8);
 533     }
 534
 535     # get the information on the last dump and part in this logfile, or create
 536     # a new logfile if none exists, then open the logfile for writing.
 537     if (defined $logfile) {
 538         $last_filenum = -1;
 539
 540         # NOTE: this depends on an implementation detail of search_logfile: it
 541         # returns the results in the reverse order of appearance in the logfile.
 542         # Since we're concerned with the last elements of this logfile that we
 543         # will be appending to shortly, we simply reverse this list.  As this
 544         # package is rewritten to parse logfiles on its own (or access a relational
 545         # database), this implementation detail will no longer be relevant.
 546         my @find_results = reverse Amanda::Logfile::search_logfile(undef, undef,
 547                                                     "$logdir/$logfile", 1);
 548         for $find_result (@find_results) {
 549             # filter out the non-dump error messages that find.c produces
 550             next unless (defined $find_result->{'label'});
 551
 552             $last_filenum = $find_result->{'filenum'};
 553
 554             # if this is part number 1, reset our secs and kbs counters on the
 555             # assumption that this is the beginning of a new dump
 556             if ($find_result->{'partnum'} =~ qr{1/\d}) {
 557                 $last_secs = $last_kbs = 0;
 558             }
 559             $last_secs += $find_result->{'sec'};
 560             $last_kbs += $find_result->{'kb'};
 561         }
 562
 563         open($logfh, ">>", "$logdir/$logfile");
 564     } else {
 565         $last_filenum = -1;
 566         $last_secs = 0;
 567         $last_kbs = 0;
 568
 569         # pick an unused log filename
 570         my $i = 0;
 571         while (1) {
 572             $logfile = "log.$write_timestamp.$i";
 573             last unless -f "$logdir/$logfile";
 574             $i++;
 575         }
 576
 577         open($logfh, ">", "$logdir/$logfile")
 578             or die("Could not write '$logdir/$logfile': $!");
 579
 580         print $logfh
 581             "INFO taper This logfile was generated by Amanda::DB::Catalog\n";
 582
 583         print $logfh
 584             "START taper datestamp $write_timestamp label $dump->{label} tape $i\n";
 585
 586         if (!defined $tapelist_filename) {
 587             $tapelist_filename = config_dir_relative(getconf($CNF_TAPELIST));
 588         }
 589
 590         # reload the tapelist immediately, in case it's been modified
 591         $tapelist = Amanda::Tapelist::read_tapelist($tapelist_filename);
 592
 593         # see if we need to add an entry to the tapelist for this dump
 594         if (!grep { $_->{'label'} eq $dump->{'label'}
 595                     and zeropad($_->{'datestamp'}) eq zeropad($dump->{'write_timestamp'})
 596                 } @$tapelist) {
 597             $tapelist->add_tapelabel($write_timestamp, $dump->{'label'});
 598             $tapelist->write($tapelist_filename);
 599         }
 600     }
 601
 602     if ($last_filenum >= 0 && $last_filenum+1 != $dump->{'filenum'}) {
 603         warn "Discontinuity in filenums in $logfile: " .
 604              "from $last_filenum to $dump->{filenum}";
 605     }
 606
 607     my $kps = $dump->{'sec'}? (($dump->{'kb'} + 0.0) / $dump->{'sec'}) : 0.0;
 608
 609     my $part_line = "PART taper ";
 610     $part_line .= "$dump->{label} ";
 611     $part_line .= "$dump->{filenum} ";
 612     $part_line .= quote_string($dump->{hostname}) . " ";
 613     $part_line .= quote_string($dump->{diskname}) . " ";
 614     $part_line .= "$dump->{dump_timestamp} ";
 615     $part_line .= "$dump->{partnum}/$dump->{nparts} ";
 616     $part_line .= "$dump->{level} ";
 617     $part_line .= "[sec $dump->{sec} kb $dump->{kb} kps $kps]";
 618     print $logfh "$part_line\n";
 619
 620     # TODO: we don't always know nparts when writing a part, so
 621     # this is not always an effective way to detect a complete dump.
 622     # However, it works for purposes of data vaulting.
 623     if ($dump->{'partnum'} == $dump->{'nparts'}) {
 624         my $secs = $last_secs + $dump->{'sec'};
 625         my $kbs = $last_kbs + $dump->{'kb'};
 626         $kps = $secs? ($kbs + 0.0) / $secs : 0.0;
 627
 628         my $done_line = "DONE taper ";
 629         $done_line .= quote_string($dump->{hostname}) ." ";
 630         $done_line .= quote_string($dump->{diskname}) ." ";
 631         $done_line .= "$dump->{dump_timestamp} ";
 632         $done_line .= "$dump->{nparts} ";
 633         $done_line .= "$dump->{level} ";
 634         $done_line .= "[sec $secs kb $kbs kps $kps]";
 635         print $logfh "$done_line\n";
 636     }
 637
 638     close($logfh);
 639 }
 640
 641 sub _load_tapelist {
 642     if (!defined $tapelist) {
 643         $tapelist_filename = config_dir_relative(getconf($CNF_TAPELIST));
 644         $tapelist = Amanda::Tapelist::read_tapelist($tapelist_filename);
 645     }
 646 }
 647
 648 sub _clear_cache { # (used by installcheck)
 649     $tapelist = $tapelist_filename = undef;
 650 }
 651
 652 1;