in lib/Mail/SpamAssassin/Plugin/Bayes.pm [1219:1385]
sub _tokenize_line {
# Split one line of already-decoded message text into a list of Bayes
# token strings.
#
# Positional arguments (read directly from @_ to avoid copying the text):
#   $_[0]  $self
#   $_[1]  the text to tokenize (copied into a localized $_ below)
#   $_[2]  $tokprefix - string prepended to most emitted tokens
#   $_[3]  $region    - 0 = headers, 1 = body, 2 = URIs (mapping inferred
#          from the HDRS/BODY/URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS checks
#          further down)
#
# Returns the list of tokens.  Operates on raw bytes, not perl characters;
# several regexps below deliberately treat multi-byte UTF-8 sequences as
# atomic units.
#
# The ALL_CAPS names used below (IGNORE_TITLE_CASE, CHEW_BODY_URIS, ...)
# are compile-time constants, presumably defined elsewhere in this file;
# false branches are optimized away at compile time.
my $self = $_[0];
my $tokprefix = $_[2];
my $region = $_[3];
# localize $_ so the bare s///, tr/// and split below act on our text
# without clobbering the caller's $_
local ($_) = $_[1];
my $conf = $self->{conf};
my @rettokens;
# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
# and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.
# Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
### (previous:) tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic)
# Keep runs of token characters and whole multi-byte UTF-8 sequences
# (2-, 3- and 4-byte forms) intact; replace every other byte with a space.
s{ ( [A-Za-z0-9,@*!_'"\$. -]+ |
[\xC0-\xDF][\x80-\xBF] |
[\xE0-\xEF][\x80-\xBF]{2} |
[\xF0-\xF4][\x80-\xBF]{3} |
[\xA1-\xFF] ) | . }
{ defined $1 ? $1 : ' ' }xsge;
# should we also turn NBSP ( \xC2\xA0 ) into space?
# DO split on "..." or "--" or "---"; common formatting error resulting in
# hapaxes. Keep the separator itself as a token, though, as long ones can
# be good spamsigns.
s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
if (IGNORE_TITLE_CASE) {
if ($region == 1 || $region == 2) {
# lower-case Title Case at start of a full-stop-delimited line (as would
# be seen in a Western language).
s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge;
}
}
# regexp matching this store's internal "magic" tokens; may be undef
# for backends without magic tokens (see the SQL note below)
my $magic_re = $self->{store}->get_magic_re();
# Note that split() in scope of 'use bytes' results in words with utf8 flag
# cleared, even if the source string has perl characters semantics !!!
# Is this really still desirable?
# bare split: splits $_ on whitespace, discarding leading whitespace
TOKEN: foreach my $token (split) {
$token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end
$token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens
# Skip false magic tokens
# TVD: we need to do a defined() check since SQL doesn't have magic
# tokens, so the SQL BayesStore returns undef. I really want a way
# of optimizing that out, but I haven't come up with anything yet.
#
next if ( defined $magic_re && $token =~ /$magic_re/o );
# *do* keep 3-byte tokens; there's some solid signs in there
my $len = length($token);
# but extend the stop-list. These are squarely in the gray
# area, and it just slows us down to record them.
# See http://wiki.apache.org/spamassassin/BayesStopList for more info.
#
next if $len < 3;
# check stopwords regexp if not cached
# stopword_cache memoizes the verdict per token: 1 = matched a
# configured language's stopword list, 0 = checked and clean
if (@{$conf->{bayes_stopword_languages}}) {
if (!exists $self->{stopword_cache}{$token}) {
foreach my $lang (@{$conf->{bayes_stopword_languages}}) {
if ($token =~ $self->{bayes_stopword}{$lang}) {
dbg("bayes: skipped token '$token' because it's in stopword list for language '$lang'");
$self->{stopword_cache}{$token} = 1;
next TOKEN;
}
}
$self->{stopword_cache}{$token} = 0;
} else {
# bail out if cached known
next if $self->{stopword_cache}{$token};
}
}
# are we in the body? If so, apply some body-specific breakouts
if ($region == 1 || $region == 2) {
if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
push (@rettokens, $self->_tokenize_mail_addrs ($token));
}
elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
# NOTE(review): UD tokens are emitted without $tokprefix, unlike
# most other tokens here -- presumably deliberate, so URL-domain
# tokens are shared across regions
push (@rettokens, "UD:".$token); # the full token
# strip one leading dot-delimited label per iteration, emitting
# each successively shorter domain suffix ("a.b.c" -> "b.c", "c")
my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
push (@rettokens, "UD:".$1); # UD = URL domain
}
}
}
# note: do not trim down overlong tokens if they contain '*'. This is
# used as part of split tokens such as "HTo:D*net" indicating that
# the domain ".net" appeared in the To header.
#
if ($len > $conf->{bayes_max_token_length} && index($token, '*') == -1) {
if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
# Bug 7135
# collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences
my(@t) = $token =~ /( (?: [\xE0-\xEF] | [\xF0-\xF4][\x80-\xBF] )
[\x80-\xBF]{2} )/xsg;
if (@t) {
push (@rettokens, map($tokprefix.'u8:'.$_, @t));
next;
}
}
if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
# but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
# to me! (jm)
# chop the token into 2-byte tuples (final tuple may be 1 byte)
while ($token =~ s/^(..?)//) {
push (@rettokens, $tokprefix.'8:'.$1);
}
next;
}
if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
|| ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
|| ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
{
# if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
# Spambayes trick via Matt: Just retain 7 chars. Do not retain the
# length, it does not help; see jm's mail to -devel on Nov 20 2002 at
# http://sourceforge.net/p/spamassassin/mailman/message/12977605/
# "sk:" stands for "skip".
# Bug 7141: retain seven UTF-8 chars (or other bytes),
# if followed by at least two bytes
$token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] |
[\xC0-\xDF][\x80-\xBF] |
[\xE0-\xEF][\x80-\xBF]{2} |
[\xF0-\xF4][\x80-\xBF]{3} | . ){7} ))
.{2,} \z }{sk:$1}xs;
## (was:) $token = "sk:".substr($token, 0, 7); # seven bytes
}
}
# decompose tokens? do this after shortening long tokens
if ($region == 1 || $region == 2) {
if (DECOMPOSE_BODY_TOKENS) {
if ($token =~ /[^\w:\*]/) {
my $decompd = $token; # "Foo!"
$decompd =~ s/[^\w:\*]//gs;
push (@rettokens, $tokprefix.$decompd); # "Foo"
}
if ($token =~ /[A-Z]/) {
my $decompd = $token; $decompd = lc $decompd;
push (@rettokens, $tokprefix.$decompd); # "foo!"
if ($token =~ /[^\w:\*]/) {
$decompd =~ s/[^\w:\*]//gs;
push (@rettokens, $tokprefix.$decompd); # "foo"
}
}
}
}
# finally, the token itself (possibly rewritten to an "sk:" form above)
push (@rettokens, $tokprefix.$token);
}
return @rettokens;
}