sub restore_database()

in lib/Mail/SpamAssassin/BayesStore/DBM.pm [1625:1877]


# Rebuild the Bayes token and "seen" DBM files from a text backup file.
#
# Arguments:
#   $filename - path of the dump file to read
#   $showdots - if true, print a "." to STDERR every 1000 input lines and a
#               newline once the whole file has been read
#
# The first line must be "v <n> db_version" with <n> equal to 2 or 3.  Every
# following line is dispatched on its first field:
#   v <value> num_spam|num_nonspam     message counters (both are required)
#   t <spam> <ham> <atime> <token>     token record
#   s <h|s> <msgid>                    seen record
# Anything else is logged via dbg() and skipped.
#
# Records are loaded into temporary "<bayes_path>_toks.convert<pid>" and
# "<bayes_path>_seen.convert<pid>" databases, which are renamed to
# "<bayes_path>_toks" and "<bayes_path>_seen" only after the whole dump file
# has been parsed successfully.
#
# Returns 1 on success and 0 on failure.  Dies if the dump file cannot be
# read or closed; the temporary databases are removed and the writable tie is
# released before dying.
sub restore_database {
  my ($self, $filename, $showdots) = @_;

  local *DUMPFILE;
  if (!open(DUMPFILE, '<', $filename)) {
    dbg("bayes: unable to open backup file $filename: $!");
    return 0;
  }

  if (!$self->tie_db_writable()) {
    dbg("bayes: failed to tie db writable");
    close(DUMPFILE);
    return 0;
  }

  my $main = $self->{bayes}->{main};
  my $path = $main->sed_path($main->{conf}->{bayes_path});

  # use a temporary PID-based suffix just in case another one was
  # created previously by an interrupted expire
  my $tmpsuffix = "convert$$";
  my $tmptoksdbname = $path.'_toks.'.$tmpsuffix;
  my $tmpseendbname = $path.'_seen.'.$tmpsuffix;
  my $toksdbname = $path.'_toks';
  my $seendbname = $path.'_seen';

  my %new_toks;
  my %new_seen;
  # Clear the umask while creating the temp databases so that the mode
  # derived from bayes_file_mode is applied as given; restored right after.
  my $umask = umask 0;
  unless (tie %new_toks, $self->DBM_MODULE, $tmptoksdbname, O_RDWR|O_CREAT|O_EXCL,
          (oct ($main->{conf}->{bayes_file_mode}) & 0666)) {
    dbg("bayes: failed to tie temp toks db: $!");
    $self->untie_db();
    umask $umask;
    close(DUMPFILE);
    return 0;
  }
  unless (tie %new_seen, $self->DBM_MODULE, $tmpseendbname, O_RDWR|O_CREAT|O_EXCL,
          (oct ($main->{conf}->{bayes_file_mode}) & 0666)) {
    dbg("bayes: failed to tie temp seen db: $!");
    untie %new_toks;
    $self->_unlink_file($tmptoksdbname);
    $self->untie_db();
    umask $umask;
    close(DUMPFILE);
    return 0;
  }
  umask $umask;

  # Shared failure cleanup: drop both temporary databases, release the
  # writable tie and close the dump file.  Used by every failure path from
  # here until the temporary databases are about to be renamed, including
  # the paths that die, so nothing is left behind.
  my $abandon_restore = sub {
    untie %new_toks;
    untie %new_seen;
    $self->_unlink_file($tmptoksdbname);
    $self->_unlink_file($tmpseendbname);
    $self->untie_db();
    close(DUMPFILE);   # result ignored; the handle may already be closed
  };

  my $line_count = 0;
  my $db_version;
  my $token_count = 0;
  my $num_spam;
  my $num_ham;
  my $error_p = 0;
  my $newest_token_age = 0;
  # Kinda weird I know, but we need a nice big value and we know there will be
  # no tokens > time() since we reset atime if > time(), so use that with a
  # little buffer just in case.
  my $oldest_token_age = time() + 100000;

  my $line = <DUMPFILE>;
  if (!defined $line) {
    # capture the error text first: the cleanup calls may overwrite $!
    my $read_error = "$!";
    $abandon_restore->();
    die "Error reading dump file: $read_error";
  }
  $line_count++;

  # We require the database version line to be the first in the file so we can
  # figure out how to properly deal with the file.  If it is not the first
  # line then fail
  if ($line =~ m/^v\s+(\d+)\s+db_version/) {
    $db_version = $1;
  }
  else {
    dbg("bayes: database version must be the first line in the backup file, correct and re-run");
    $abandon_restore->();
    return 0;
  }

  unless ($db_version == 2 || $db_version == 3) {
    warn("bayes: database version $db_version is unsupported, must be version 2 or 3");
    $abandon_restore->();
    return 0;
  }

  # $! is cleared before every read so that, once the loop ends, a non-zero
  # $! can only have come from the final (failed) read.
  for ($!=0; defined($line=<DUMPFILE>); $!=0) {
    chomp($line);
    $line_count++;

    if ($line_count % 1000 == 0) {
      print STDERR "." if ($showdots);
    }

    if ($line =~ /^v\s+/) { # variable line
      my @parsed_line = split(/\s+/, $line, 3);
      my $value = $parsed_line[1] + 0;
      # a line with no variable name falls through to the "unknown" branch
      my $varname = defined $parsed_line[2] ? $parsed_line[2] : '';
      if ($varname eq 'num_spam') {
        $num_spam = $value;
      }
      elsif ($varname eq 'num_nonspam') {
        $num_ham = $value;
      }
      else {
        dbg("bayes: restore_database: skipping unknown line: $line");
      }
    }
    elsif ($line =~ /^t\s+/) { # token line
      my @parsed_line = split(/\s+/, $line, 5);

      # A truncated line has no token field; storing it would create a
      # record under an empty/garbage key, so skip it instead.
      if (@parsed_line < 5 || $parsed_line[4] eq '') {
        dbg("bayes: malformed token line: $line, skipping");
        next;
      }

      my $spam_count = $parsed_line[1] + 0;
      my $ham_count = $parsed_line[2] + 0;
      my $atime = $parsed_line[3] + 0;
      my $token = $parsed_line[4];

      my $token_warn_p = 0;
      my @warnings;

      if ($spam_count < 0) {
        $spam_count = 0;
        push(@warnings, 'spam count < 0, resetting');
        $token_warn_p = 1;
      }
      if ($ham_count < 0) {
        $ham_count = 0;
        push(@warnings, 'ham count < 0, resetting');
        $token_warn_p = 1;
      }

      if ($spam_count == 0 && $ham_count == 0) {
        dbg("bayes: token has zero spam and ham count, skipping");
        next;
      }

      if ($atime > time()) {
        $atime = time();
        push(@warnings, 'atime > current time, resetting');
        $token_warn_p = 1;
      }

      if ($token_warn_p) {
        dbg("bayes: token (%s) has the following warnings:\n%s",
            $token, join("\n",@warnings));
      }

      # database versions < 3 did not encode their token values
      if ($db_version < 3) {
        $token = substr(sha1($token), -5);
      }
      else {
        # turn unpacked binary token back into binary value
        $token = pack("H*",$token);
      }

      $new_toks{$token} = $self->tok_pack($spam_count, $ham_count, $atime);
      if ($atime < $oldest_token_age) {
        $oldest_token_age = $atime;
      }
      if ($atime > $newest_token_age) {
        $newest_token_age = $atime;
      }
      # NOTE(review): counted per line, so a token that appears twice in the
      # dump is counted twice -- confirm dumps never repeat a token.
      $token_count++;
    }
    elsif ($line =~ /^s\s+/) { # seen line
      my @parsed_line = split(/\s+/, $line, 3);
      my $flag = $parsed_line[1];
      my $msgid = $parsed_line[2];

      unless ($flag eq 'h' || $flag eq 's') {
        dbg("bayes: unknown seen flag ($flag) for line: $line, skipping");
        next;
      }

      # test for missing/empty rather than false, so a msgid of "0" is kept
      unless (defined $msgid && $msgid ne '') {
        dbg("bayes: blank msgid for line: $line, skipping");
        next;
      }

      $new_seen{$msgid} = $flag;
    }
    else {
      dbg("bayes: skipping unknown line: $line");
      next;
    }
  }

  # The loop ends when the read returns undef; that is a plain end-of-file
  # unless the read left an error in $!.
  if (!defined $line && $! != 0) {
    my $read_error = "$!";
    $abandon_restore->();
    die "Error reading dump file: $read_error";
  }
  if (!close(DUMPFILE)) {
    my $close_error = "$!";
    $abandon_restore->();
    die "Can't close dump file: $close_error";
  }

  print STDERR "\n" if ($showdots);

  unless (defined($num_spam)) {
    dbg("bayes: unable to find num spam, please check file");
    $error_p = 1;
  }

  unless (defined($num_ham)) {
    dbg("bayes: unable to find num ham, please check file");
    $error_p = 1;
  }

  if ($error_p) {
    dbg("bayes: error(s) while attempting to load $filename, correct and re-run");

    $abandon_restore->();
    return 0;
  }

  # set the calculated magic tokens
  $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION();
  $new_toks{$NTOKENS_MAGIC_TOKEN} = $token_count;
  $new_toks{$NSPAM_MAGIC_TOKEN} = $num_spam;
  $new_toks{$NHAM_MAGIC_TOKEN} = $num_ham;
  $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newest_token_age;
  $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest_token_age;

  # go ahead and zero out these, chances are good that they are bogus anyway.
  $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = 0;
  $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = 0;
  $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
  $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;

  # ignore interrupting signals for the rest of the sub, while the temporary
  # databases are flushed and swapped into place
  local $SIG{'INT'} = 'IGNORE';
  local $SIG{'TERM'} = 'IGNORE';
  local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows();

  untie %new_toks;
  untie %new_seen;
  $self->untie_db();

  # Here is where something can go horribly wrong and screw up the bayes
  # database files.  If we are able to copy one and not the other then it
  # will leave the database in an inconsistent state.  Since this is an
  # edge case, and they're trying to replace the DB anyway we should be ok.
  unless ($self->_rename_file($tmptoksdbname, $toksdbname)) {
    dbg("bayes: error while renaming $tmptoksdbname to $toksdbname: $!");
    # Neither rename has succeeded at this point, so do not leave the
    # temporary databases lying around.
    # NOTE(review): presumes a failed _rename_file leaves its target
    # untouched -- confirm against its implementation.
    $self->_unlink_file($tmptoksdbname);
    $self->_unlink_file($tmpseendbname);
    return 0;
  }
  unless ($self->_rename_file($tmpseendbname, $seendbname)) {
    dbg("bayes: error while renaming $tmpseendbname to $seendbname: $!");
    dbg("bayes: database now in inconsistent state");
    # the temporary seen db is deliberately kept: it is the only copy of the
    # restored seen data and can still be moved into place by hand
    return 0;
  }

  dbg("bayes: parsed $line_count lines");
  dbg("bayes: created database with $token_count tokens based on $num_spam spam messages and $num_ham ham messages");

  return 1;
}