in lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm [739:959]
sub unroll_branches {
my ($self, $depth, $opslist) = @_;
die "too deep" if ($depth++ > 5);
my @ops = (@{$opslist}); # copy
my @pre_branch_ops;
my $branch_spcs;
my $trie_spcs;
my $open_spcs;
# our input looks something like this 2-level structure:
# 1: BOUND(2)
# 2: EXACT <Dear >(5)
# 5: BRANCH(9)
# 6: EXACT <IT>(8)
# 8: NALNUM(24)
# 9: BRANCH(23)
# 10: EXACT <Int>(12)
# 12: BRANCH(14)
# 13: NOTHING(21)
# 14: BRANCH(17)
# 15: EXACT <a>(21)
# 17: BRANCH(20)
# 18: EXACT <er>(21)
# 20: TAIL(21)
# 21: EXACT <net>(24)
# 23: TAIL(24)
# 24: EXACT < shop>(27)
# 27: END(0)
#
# or:
#
# 1: OPEN1(3)
# 3: BRANCH(6)
# 4: EXACT <v>(9)
# 6: BRANCH(9)
# 7: EXACT <\\/>(9)
# 9: CLOSE1(11)
# 11: CURLY {2,5}(14)
# 13: REG_ANY(0)
# 14: EXACT < g r a >(17)
# 17: ANYOF[a-z](28)
# 28: END(0)
#
# or:
#
# 1: EXACT <i >(3)
# 3: OPEN1(5)
# 5: TRIE-EXACT[am](21)
# <am> (21)
# <might> (12)
# 12: OPEN2(14)
# 14: TRIE-EXACT[ ](19)
# < be>
# <>
# 19: CLOSE2(21)
# 21: CLOSE1(23)
# 23: EXACT < c>(25)
DEBUG_RE_PARSING and warn "starting parse";
# this happens for /foo|bar/ instead of /(?:foo|bar)/ ; transform
# it into the latter. bit of a kludge to do this before the loop, but hey.
# note that it doesn't fix the CLOSE1/END ordering to be correct
if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) {
my @newops = ([ "", "OPEN1", "" ]);
foreach my $op (@ops) {
push @newops, [ " ".$op->[0], $op->[1], $op->[2] ];
}
push @newops, [ "", "CLOSE1", "" ];
@ops = @newops;
}
# iterate until we start a branch set. using
# /dkjfksl(foo|bar(baz|argh)boo)gab/ as an example, we're at "dkj..."
# just hitting an OPEN is not enough; wait until we see a TRIE-EXACT
# or a BRANCH, *then* unroll the most recent OPEN set.
while (1) {
my $op = shift @ops;
last unless defined $op;
my ($spcs, $item, $args) = @{$op};
DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args";
if ($item =~ /^OPEN/) {
$open_spcs = $spcs;
next; # next will be a BRANCH or TRIE
} elsif ($item =~ /^TRIE/) {
$trie_spcs = $spcs;
last;
} elsif ($item =~ /^BRANCH/) {
$branch_spcs = $spcs;
last;
} elsif ($item =~ /^EXACT/ && defined $open_spcs) {
# perl 5.9.5 does this; f(o|oish) => OPEN, EXACT, TRIE-EXACT
push @pre_branch_ops, [ $open_spcs, $item, $args ];
next;
} elsif (defined $open_spcs) {
# OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT:
# ignore this OPEN block entirely and don't try to unroll it
undef $open_spcs;
} else {
push @pre_branch_ops, $op;
}
}
# no branches found? we're done unrolling on this one!
if (scalar @ops == 0) {
return [ @pre_branch_ops ];
}
# otherwise we're at the start of a new branch set
# /(foo|bar(baz|argh)boo)gab/
my @alts;
my @in_this_branch;
DEBUG_RE_PARSING and warn "entering branch: ".
"open='".(defined $open_spcs ? $open_spcs : 'undef')."' ".
"branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ".
"trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'";
# indentation level to remove from "normal" ops (using a s///)
my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
my $trie_sub_spcs = "";
while (1) {
my $op = shift @ops;
last unless defined $op;
my ($spcs, $item, $args) = @{$op};
DEBUG_RE_PARSING and warn "in: [$spcs] $item $args";
if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) { # alt
push @alts, [ @pre_branch_ops, @in_this_branch ];
@in_this_branch = ();
$open_sub_spcs = $branch_spcs." ";
$trie_sub_spcs = "";
next;
}
elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end
push @alts, [ @pre_branch_ops, @in_this_branch ];
undef $branch_spcs;
$open_sub_spcs = "";
$trie_sub_spcs = "";
last;
}
elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') {
if (scalar @in_this_branch > 0) {
push @alts, [ @pre_branch_ops, @in_this_branch ];
}
# use $open_spcs instead of $trie_spcs (which is 2 spcs further indented)
@in_this_branch = ( [ $open_spcs, $item, $args ] );
$open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
$trie_sub_spcs = " ";
next;
}
elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) { # end
push @alts, [ @pre_branch_ops, @in_this_branch ];
undef $branch_spcs;
undef $open_spcs;
undef $trie_spcs;
$open_sub_spcs = "";
$trie_sub_spcs = "";
last;
}
elsif ($item eq 'END') { # of string
push @alts, [ @pre_branch_ops, @in_this_branch ];
undef $branch_spcs;
undef $open_spcs;
undef $trie_spcs;
$open_sub_spcs = "";
$trie_sub_spcs = "";
last;
}
else {
if ($open_sub_spcs) {
# deindent the space-level to match the opening brace
$spcs =~ s/^$open_sub_spcs//;
# tries also add one more indent level in
$spcs =~ s/^$trie_sub_spcs//;
}
push @in_this_branch, [ $spcs, $item, $args ];
# note that we ignore ops at a deeper $spcs level entirely (until later!)
}
}
if (defined $branch_spcs) {
die "fell off end of string with a branch open: '$branch_spcs'";
}
# we're now after the branch set: /gab/
# @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz|argh)boo/ ]
foreach my $alt (@alts) {
push @{$alt}, @ops; # add all remaining ops to each one
# note that this could include more (?:...); we don't care, since
# those can be handled by recursing
}
# ok, parsed the entire ops list
# @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz|argh)boogab/ ]
if (DEBUG_RE_PARSING) {
print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
}
# now recurse, to unroll the remaining branches (if any exist)
my @rets;
foreach my $alt (@alts) {
push @rets, $self->unroll_branches($depth, $alt);
}
if (DEBUG_RE_PARSING) {
print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
}
return @rets;
}