in lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm [132:387]
sub extract_set_pri {
my ($self, $conf, $rules, $ruletype) = @_;
my @good_bases;
my @failed;
my $yes = 0;
my $no = 0;
my $count = 0;
my $start = time;
$self->{main} = $conf->{main}; # for use in extract_hints()
$self->{show_progress} and info ("extracting from rules of type $ruletype");
my $tflags = $conf->{tflags};
# attempt to find good "base strings" (simplified regexp subsets) for each
# regexp. We try looking at the regexp from both ends, since there
# may be a good long string of text at the end of the rule.
# require this many chars in a base string + delimiters for it to be viable
my $min_chars = 5;
my $progress;
$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
total => (scalar keys %{$rules} || 1),
itemtype => 'rules',
});
my $cached = { };
my $cachefile;
if ($self->{main}->{bases_cache_dir}) {
$cachefile = $self->{main}->{bases_cache_dir}."/rules.$ruletype";
dbg("zoom: reading cache file $cachefile");
$cached = $self->read_cachefile($cachefile);
}
NEXT_RULE:
foreach my $name (keys %{$rules}) {
$self->{show_progress} and $progress and $progress->update(++$count);
#my $rule = $rules->{$name};
my $rule = qr_to_string($conf->{test_qrs}->{$name});
if (!defined $rule) {
die "zoom: error: regexp for $rule not found\n";
}
my $cachekey = $name.'#'.$rule;
my $cent = $cached->{rule_bases}->{$cachekey};
if (defined $cent) {
if (defined $cent->{g}) {
dbg("zoom: YES (cached) $rule $name");
foreach my $ent (@{$cent->{g}}) {
# note: we have to copy these, since otherwise later
# modifications corrupt the cached data
push @good_bases, {
base => $ent->{base}, orig => $ent->{orig}, name => $ent->{name}
};
}
$yes++;
}
else {
dbg("zoom: NO (cached) $rule $name");
push @failed, { orig => $rule }; # no need to cache this
$no++;
}
next NEXT_RULE;
}
# ignore ReplaceTags rules, and regex capture template rules
my $is_a_replace_rule = $conf->{replace_rules}->{$name} ||
$conf->{capture_rules}->{$name} ||
$conf->{capture_template_rules}->{$name};
my ($minlen, $lossy, @bases);
if (!$is_a_replace_rule) {
eval { # catch die()s
my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);
($lossy, @bases) = $self->extract_hints($rule, $qr, $mods);
# dbg("zoom: %s %s -> %s", $name, $rule, join(", ", @bases));
1;
} or do {
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
$eval_stat =~ s/ at .*//s;
dbg("zoom: giving up on regexp: $eval_stat");
};
#if ($lossy && ($tflags->{$name}||'') =~ /\bmultiple\b/) {
# warn "\nzoom: $vers rule $name will loop on SpamAssassin older than 3.3.2 ".
# "running under Perl 5.12 or older, Bug 6558\n";
#}
# if any of the extracted hints in a set are too short, the entire
# set is invalid; this is because each set of N hints represents just
# 1 regexp.
foreach my $str (@bases) {
my $len = length fixup_re($str); # bug 6143: count decoded characters
if ($len < $min_chars) { $minlen = undef; @bases = (); last; }
elsif (!defined($minlen) || $len < $minlen) { $minlen = $len; }
}
}
if ($is_a_replace_rule || !$minlen || !@bases) {
dbg("zoom: ignoring rule %s, %s", $name,
$is_a_replace_rule ? 'is a replace rule'
: !@bases ? 'no bases' : 'no minlen');
push @failed, { orig => $rule };
$cached->{rule_bases}->{$cachekey} = { };
$no++;
}
else {
# dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");
# figure out if we have e.g. ["foo", "foob", "foobar"]; in this
# case, we only need to track ["foo"].
my %subsumed;
foreach my $base1 (@bases) {
foreach my $base2 (@bases) {
if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) {
$subsumed{$base1} = 1; # base2 is inside base1; discard the longer
}
}
}
my @forcache;
foreach my $base (@bases) {
next if $subsumed{$base};
push @good_bases, {
base => $base, orig => $rule, name => "$name,[l=$lossy]"
};
# *separate* copies for cache -- we modify the @good_bases entry
push @forcache, {
base => $base, orig => $rule, name => "$name,[l=$lossy]"
};
}
$cached->{rule_bases}->{$cachekey} = { g => \@forcache };
$yes++;
}
}
$self->{show_progress} and $progress and $progress->final();
dbg("zoom: $ruletype: found ".(scalar @good_bases).
" usable base strings in $yes rules, skipped $no rules");
# NOTE: re2c will attempt to provide the longest pattern that matched; e.g.
# ("food" =~ "foo" / "food") will return "food". So therefore if a pattern
# subsumes other patterns, we need to return hits for all of them. We also
# need to take care of the case where multiple regexps wind up sharing the
# same base.
#
# Another gotcha, an exception to the subsumption rule; if one pattern isn't
# entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be
# returned as two hits, correctly. So we only have to be smart about the
# full-subsumption case; overlapping is taken care of for us, by re2c.
#
# TODO: there's a bug here. Since the code in extract_hints() has been
# modified to support more complex regexps, we can no longer simply assume
# that if pattern A is not contained in pattern B, that means that pattern B
# doesn't subsume it. Consider, for example, A="foo*bar" and
# B="morefobarry"; A is indeed subsumed by B, but we won't be able to test
# that without running the A RE match itself somehow against B.
# same issue remains with:
#
# "foo?bar" / "fobar"
# "fo(?:o|oo|)bar" / "fobar"
# "fo(?:o|oo)?bar" / "fobar"
# "fo(?:o*|baz)bar" / "fobar"
# "(?:fo(?:o*|baz)bar|blargh)" / "fobar"
#
# it's worse with this:
#
# "fo(?:o|oo|)bar" / "foo*bar"
#
# basically, this is impossible to compute without reimplementing most of
# re2c, and it appears the re2c developers don't plan to offer this:
# https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203
$conf->{base_orig}->{$ruletype} = { };
$conf->{base_string}->{$ruletype} = { };
$count = 0;
$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
total => (scalar @good_bases || 1),
itemtype => 'bases',
});
# this bit is annoyingly O(N^2). Rewrite the data -- the @good_bases
# array -- into a more efficient format, using arrays and with a little
# bit of precomputation, to go (quite a bit) faster
my @rewritten;
foreach my $set1 (@good_bases) {
my $base = $set1->{base};
next if (!$base || !$set1->{name});
push @rewritten, [
$base, # 0 - SLOT_BASE
$set1->{name}, # 1 - SLOT_NAME
$set1->{orig}, # 2 - SLOT_ORIG
length $base, # 3 - SLOT_LEN_BASE
$base, # 4 - SLOT_BASE_INITIAL
0 # 5 - SLOT_HAS_MULTIPLE, has_multiple flag
];
}
@good_bases = sort {
$b->[SLOT_LEN_BASE] <=> $a->[SLOT_LEN_BASE] ||
$a->[SLOT_BASE] cmp $b->[SLOT_BASE] ||
$a->[SLOT_NAME] cmp $b->[SLOT_NAME] ||
$a->[SLOT_ORIG] cmp $b->[SLOT_ORIG]
} @rewritten;
my $base_orig = $conf->{base_orig}->{$ruletype};
my $next_base_position = 0;
for my $set1 (@good_bases) {
$next_base_position++;
$self->{show_progress} and $progress and $progress->update(++$count);
my $base1 = $set1->[SLOT_BASE] or next; # got clobbered
my $name1 = $set1->[SLOT_NAME];
my $orig1 = $set1->[SLOT_ORIG];
my $len1 = $set1->[SLOT_LEN_BASE];
$base_orig->{$name1} = $orig1;
foreach my $set2 (@good_bases[$next_base_position .. $#good_bases]) { # order from smallest to largest
# clobber false and exact dups; this can happen if a regexp outputs the
# same base string multiple times
if (!$set2->[SLOT_BASE] ||
(
$base1 eq $set2->[SLOT_BASE] &&
$name1 eq $set2->[SLOT_NAME] &&
$orig1 eq $set2->[SLOT_ORIG]
)
)
{
#dbg("clobbering: [base2][$set2->[SLOT_BASE]][name2][$set2->[SLOT_NAME]][orig][$set2->[SLOT_ORIG]]");
$set2->[SLOT_BASE] = CLOBBER; # clobber
next;
}
# Cannot be a subset if it does not contain the other base string
next if index($base1,$set2->[SLOT_BASE_INITIAL]) == -1;
# skip if either already contains the other rule's name
# optimize: this can only happen if the base has more than
# one rule already attached, ie [5]
next if ($set2->[SLOT_HAS_MULTIPLE] && index($set2->[SLOT_NAME],$name1) > -1 && $set2->[SLOT_NAME] =~ /(?: |^)\Q$name1\E(?: |$)/);
# don't use $name1 here, since another base in the set2 loop
# may have added $name2 since we set that
next if ($set1->[SLOT_HAS_MULTIPLE] && index($set1->[SLOT_NAME],$set2->[SLOT_NAME]) > -1 && $set1->[SLOT_NAME] =~ /(?: |^)\Q$set2->[SLOT_NAME]\E(?: |$)/);
# $set2->[SLOT_BASE] is just a subset of base1
#dbg("zoom: subsuming '$set2->[SLOT_BASE]' ($set2->[SLOT_NAME]) into '$base1': [SLOT_BASE]=$set1->[SLOT_BASE] [SLOT_HAS_MULTIPLE]=$set1->[SLOT_HAS_MULTIPLE]");
$set1->[SLOT_NAME] .= " ".$set2->[SLOT_NAME];
$set1->[SLOT_HAS_MULTIPLE] = 1;
}
}