in lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm [515:735]
sub extract_hints {
my ($self, $rawrule, $rule, $mods) = @_;
my $main = $self->{main};
my $orig = $rule;
my $lossy = 0;
$mods =~ s/L// and $lossy++;
# if there are anchors, give up; we can't get much
# faster than these anyway
die "anchors" if $rule =~ /^\(?(?:\^|\\A)/;
# die "anchors" if $rule =~ /(?:\$|\\Z)\)?$/;
# just remove end-of-string anchors; they're slow so could gain
# from our speedup
$rule =~ s/(?<!\\)(?:\$|\\Z)\)?$// and $lossy++;
# simplify (?:..) to (..)
$main->{bases_allow_noncapture_groups} or
$rule =~ s/\(\?:/\(/g;
# simplify some grouping arrangements so they're easier for us to parse
# (foo)? => (foo|)
$rule =~ s/\((.*?)\)\?/\($1\|\)/gs;
# r? => (r|)
$rule =~ s/(?<!\\)(\w)\?/\($1\|\)/gs;
# Create single tmpfile for extract_hints to use, instead of thousands
if (!$self->{tmpf}) {
($self->{tmpf}, my $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
$tmpfh or die "failed to create a temporary file";
close $tmpfh;
$self->{tmpf} = untaint_var($self->{tmpf});
}
open(my $tmpfh, '>'.$self->{tmpf})
or die "error opening $self->{tmpf}: $!";
binmode $tmpfh;
print $tmpfh "use bytes; m{" . $rule . "}" . $mods
or die "error writing to $self->{tmpf}: $!";
close $tmpfh or die "error closing $self->{tmpf}: $!";
$self->{perl} = $self->get_perl() if !exists $self->{perl};
local *IN;
open (IN, "$self->{perl} -c -Mre=debug $self->{tmpf} 2>&1 |")
or die "cannot run $self->{perl}: ".exit_status_str($?,$!);
my($inbuf,$nread,$fullstr); $fullstr = '';
while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf }
defined $nread or die "error reading from pipe: $!";
close IN or die "error closing pipe: $!";
defined $fullstr or warn "empty result from a pipe";
# now parse the -Mre=debug output.
# perl 5.10 format
$fullstr =~ s/^.*\nFinal program:\n//gs;
# perl 5.6/5.8 format
$fullstr =~ s/^(?:.*\n|)size \d[^\n]*\n//gs;
$fullstr =~ s/^(?:.*\n|)first at \d[^\n]*\n//gs;
# common to all
$fullstr =~ s/\nOffsets:.*$//gs;
# clean up every other line that doesn't start with a space
$fullstr =~ s/^\S.*$//gm;
if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {
die "failed to parse Mre=debug output: $fullstr m{".$rule."}".$mods." $rawrule";
}
my $opsstr = $1;
# what's left looks like this:
# 1: EXACTF <v>(3)
# 3: ANYOF[1ILil](14)
# 14: EXACTF <a>(16)
# 16: CURLY {2,7}(29)
# 18: ANYOF[A-Za-z](0)
# 29: SPACE(30)
# 30: EXACTF <http://>(33)
# 33: END(0)
#
DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";
my @ops;
foreach my $op (split(/\n/s, $opsstr)) {
next unless $op;
if ($op =~ /^\s+\d+: (\s*)([A-Z]\w+)\b(.*?)\s*(?:\(\d+\))?$/) {
# perl 5.8: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...>(18)
# perl 5.10, 5.12, 5.14: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>... (18)
push @ops, [ $1, $2, $3 ];
}
elsif ($op =~ /^ (\s*)<(.*)>\.\.\.\s*$/) {
# 5: TRIE-EXACT[im](44)
# <message contained attachments that have been blocked by guin>...
my $spcs = $1;
# we could use the entire length here, but it's easier to trim to
# the length of a perl 5.8.x/5.6.x EXACT* string; that way our test
# suite results will match, since the sa-update --list extraction will
# be the same for all versions. (The "..." trailer is important btw)
my $str = substr ($2, 0, 55);
push @ops, [ $spcs, '_moretrie', "<$str...>" ];
}
elsif ($op =~ /^ (\s*)(<.*>)\s*(?:\(\d+\))?$/) {
# 5: TRIE-EXACT[am](21)
# <am> (21)
# <might> (12)
push @ops, [ $1, '_moretrie', $2 ];
}
elsif ($op =~ /^ at .+ line \d+$/) {
next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109':
}
else {
warn "cannot parse '$op': $opsstr";
next;
}
}
# unroll the branches; returns a list of versions.
# e.g. /foo(bar|baz)argh/ => [ "foobarargh", "foobazargh" ]
my @unrolled;
if ($main->{bases_split_out_alternations}) {
@unrolled = $self->unroll_branches(0, \@ops);
} else {
@unrolled = ( \@ops );
}
# now find the longest DFA-friendly string in each unrolled version
my @longests;
foreach my $opsarray (@unrolled) {
my $longestexact = '';
my $buf = '';
# use a closure to keep the code succinct
my $add_candidate = sub {
if (length $buf > length $longestexact) { $longestexact = $buf; }
$buf = '';
};
my $prevop;
foreach my $op (@{$opsarray}) {
my ($spcs, $item, $args) = @{$op};
next if ($item eq 'NOTHING');
# EXACT == case-sensitive
# EXACTF == case-i
# we can do both, since we canonicalize to lc.
if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)
{
my $str = $1;
$buf .= $str;
if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {
# a high Unicode codepoint, interpreted by perl 5.8.x. cut and stop
$add_candidate->();
}
if (length $str >= 55 && $buf =~ s/\.\.\.$//) {
# perl 5.8.x truncates with a "..." here! cut and stop
$add_candidate->();
}
}
# _moretrie == a TRIE-EXACT entry
elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)
{
$buf .= $1;
if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
# perl 5.8.x truncates with a "..." here! cut and stop
$add_candidate->();
}
}
# /(?:foo|bar|baz){2}/ results in a CURLYX beforehand
elsif ($item =~ /^EXACT/ &&
$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
$args =~ /<(.*)>/)
{
$buf .= $1;
if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
# perl 5.8.x truncates with a "..." here! cut and stop
$add_candidate->();
}
}
# CURLYX, for perl >= 5.9.5
elsif ($item =~ /^_moretrie/ &&
$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
$args =~ /<(.*)>/)
{
$buf .= $1;
if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {
# perl 5.8.x truncates with a "..." here! cut and stop
$add_candidate->();
}
}
else {
# not an /^EXACT/; clear the buffer
$add_candidate->();
if ($item !~ /^(?:END|CLOSE\d|MINMOD)$/)
{
$lossy = 1;
DEBUG_RE_PARSING and warn "item $item makes regexp lossy";
}
}
$prevop = $op;
}
$add_candidate->();
if (!$longestexact) {
die "no long-enough string found in $rawrule\n";
# all unrolled versions must have a long string, otherwise
# we cannot reliably match all variants of the rule
} else {
push @longests, ($main->{bases_must_be_casei}) ?
lc $longestexact : $longestexact;
}
}
DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";
return ($lossy, @longests);
}