sub extract_hints()

in lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm [515:735]


sub extract_hints {
  my ($self, $rawrule, $rule, $mods) = @_;

  my $main = $self->{main};
  my $orig = $rule;

  my $lossy = 0;
  $mods =~ s/L// and $lossy++;

  # if there are anchors, give up; we can't get much 
  # faster than these anyway
  die "anchors" if $rule =~ /^\(?(?:\^|\\A)/;

  # die "anchors" if $rule =~ /(?:\$|\\Z)\)?$/;
  # just remove end-of-string anchors; they're slow so could gain
  # from our speedup
  $rule =~ s/(?<!\\)(?:\$|\\Z)\)?$// and $lossy++;

  # simplify (?:..) to (..)
  $main->{bases_allow_noncapture_groups} or
            $rule =~ s/\(\?:/\(/g;

  # simplify some grouping arrangements so they're easier for us to parse
  # (foo)? => (foo|)
  $rule =~ s/\((.*?)\)\?/\($1\|\)/gs;
  # r? => (r|)
  $rule =~ s/(?<!\\)(\w)\?/\($1\|\)/gs;

  # Create single tmpfile for extract_hints to use, instead of thousands
  if (!$self->{tmpf}) {
    ($self->{tmpf}, my $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
    $tmpfh  or die "failed to create a temporary file";
    close $tmpfh;
    $self->{tmpf} = untaint_var($self->{tmpf});
  }

  open(my $tmpfh, '>'.$self->{tmpf})
    or die "error opening $self->{tmpf}: $!";
  binmode $tmpfh;
  print $tmpfh "use bytes; m{" . $rule . "}" . $mods
    or die "error writing to $self->{tmpf}: $!";
  close $tmpfh  or die "error closing $self->{tmpf}: $!";

  $self->{perl} = $self->get_perl()  if !exists $self->{perl};
  local *IN;
  open (IN, "$self->{perl} -c -Mre=debug $self->{tmpf} 2>&1 |")
    or die "cannot run $self->{perl}: ".exit_status_str($?,$!);

  my($inbuf,$nread,$fullstr); $fullstr = '';
  while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf }
  defined $nread  or die "error reading from pipe: $!";

  close IN      or die "error closing pipe: $!";
  defined $fullstr  or warn "empty result from a pipe";

  # now parse the -Mre=debug output.
  # perl 5.10 format
  $fullstr =~ s/^.*\nFinal program:\n//gs;
  # perl 5.6/5.8 format
  $fullstr =~ s/^(?:.*\n|)size \d[^\n]*\n//gs;
  $fullstr =~ s/^(?:.*\n|)first at \d[^\n]*\n//gs;
  # common to all
  $fullstr =~ s/\nOffsets:.*$//gs;

  # clean up every other line that doesn't start with a space
  $fullstr =~ s/^\S.*$//gm;

  if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {
    die "failed to parse Mre=debug output: $fullstr m{".$rule."}".$mods." $rawrule";
  }
  my $opsstr = $1;

  # what's left looks like this:
  #    1: EXACTF <v>(3)
  #    3: ANYOF[1ILil](14)
  #   14: EXACTF <a>(16)
  #   16: CURLY {2,7}(29)
  #   18:   ANYOF[A-Za-z](0)
  #   29: SPACE(30)
  #   30: EXACTF <http://>(33)
  #   33: END(0)
  #
  DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";

  my @ops;
  foreach my $op (split(/\n/s, $opsstr)) {
    next unless $op;

    if ($op =~ /^\s+\d+: (\s*)([A-Z]\w+)\b(.*?)\s*(?:\(\d+\))?$/) {
      # perl 5.8:              <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...>(18)
      # perl 5.10, 5.12, 5.14: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>... (18)
      push @ops, [ $1, $2, $3 ];
    }
    elsif ($op =~ /^      (\s*)<(.*)>\.\.\.\s*$/) {
      #    5:   TRIE-EXACT[im](44)
      #         <message contained attachments that have been blocked by guin>...
      my $spcs = $1;
      # we could use the entire length here, but it's easier to trim to
      # the length of a perl 5.8.x/5.6.x EXACT* string; that way our test
      # suite results will match, since the sa-update --list extraction will
      # be the same for all versions.  (The "..." trailer is important btw)
      my $str = substr ($2, 0, 55);
      push @ops, [ $spcs, '_moretrie', "<$str...>" ];
    }
    elsif ($op =~ /^      (\s*)(<.*>)\s*(?:\(\d+\))?$/) {
      #    5:   TRIE-EXACT[am](21)
      #         <am> (21)
      #         <might> (12)
      push @ops, [ $1, '_moretrie', $2 ];
    }
    elsif ($op =~ /^ at .+ line \d+$/) {
      next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109': 
    }
    else {
      warn "cannot parse '$op': $opsstr";
      next;
    }
  }

  # unroll the branches; returns a list of versions.
  # e.g. /foo(bar|baz)argh/ => [ "foobarargh", "foobazargh" ]
  my @unrolled;
  if ($main->{bases_split_out_alternations}) {
    @unrolled = $self->unroll_branches(0, \@ops);
  } else {
    @unrolled = ( \@ops );
  }

  # now find the longest DFA-friendly string in each unrolled version
  my @longests;
  foreach my $opsarray (@unrolled) {
    my $longestexact = '';
    my $buf = '';

    # use a closure to keep the code succinct
    my $add_candidate = sub {
      if (length $buf > length $longestexact) { $longestexact = $buf; }
      $buf = '';
    };

    my $prevop;
    foreach my $op (@{$opsarray}) {
      my ($spcs, $item, $args) = @{$op};

      next if ($item eq 'NOTHING');

      # EXACT == case-sensitive
      # EXACTF == case-i
      # we can do both, since we canonicalize to lc.
      if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)
      {
        my $str = $1;
        $buf .= $str;
        if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {
          # a high Unicode codepoint, interpreted by perl 5.8.x.  cut and stop
          $add_candidate->();
        }
        if (length $str >= 55 && $buf =~ s/\.\.\.$//) {
          # perl 5.8.x truncates with a "..." here!  cut and stop
          $add_candidate->();
        }
      }
      # _moretrie == a TRIE-EXACT entry
      elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)
      {
        $buf .= $1;
        if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
          # perl 5.8.x truncates with a "..." here!  cut and stop
          $add_candidate->();
        }
      }
      # /(?:foo|bar|baz){2}/ results in a CURLYX beforehand
      elsif ($item =~ /^EXACT/ &&
          $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
                    $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
          $args =~ /<(.*)>/)
      {
        $buf .= $1;
        if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
          # perl 5.8.x truncates with a "..." here!  cut and stop
          $add_candidate->();
        }
      }
      # CURLYX, for perl >= 5.9.5
      elsif ($item =~ /^_moretrie/ &&
          $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
                    $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
          $args =~ /<(.*)>/)
      {
        $buf .= $1;
        if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {
          # perl 5.8.x truncates with a "..." here!  cut and stop
          $add_candidate->();
        }
      }
      else {
        # not an /^EXACT/; clear the buffer
        $add_candidate->();
        if ($item !~ /^(?:END|CLOSE\d|MINMOD)$/)
        {
          $lossy = 1;
          DEBUG_RE_PARSING and warn "item $item makes regexp lossy";
        }
      }
      $prevop = $op;
    }
    $add_candidate->();

    if (!$longestexact) {
      die "no long-enough string found in $rawrule\n";
      # all unrolled versions must have a long string, otherwise
      # we cannot reliably match all variants of the rule
    } else {
      push @longests, ($main->{bases_must_be_casei}) ?
                            lc $longestexact : $longestexact;
    }
  }

  DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";
  return ($lossy, @longests);
}