sub _tokenize_line()

in lib/Mail/SpamAssassin/Plugin/Bayes.pm [1219:1385]


sub _tokenize_line {
  my $self = $_[0];
  my $tokprefix = $_[2];
  my $region = $_[3];
  local ($_) = $_[1];

  my $conf = $self->{conf};
  my @rettokens;

  # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
  # and ISO-8859-15 alphas.  Do not split on @'s; better results keeping it.
  # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"

  ### (previous:)  tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;

  ### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic)
  s{ ( [A-Za-z0-9,@*!_'"\$. -]+  |
       [\xC0-\xDF][\x80-\xBF]    |
       [\xE0-\xEF][\x80-\xBF]{2} |
       [\xF0-\xF4][\x80-\xBF]{3} |
       [\xA1-\xFF] ) | . }
   { defined $1 ? $1 : ' ' }xsge;
  # should we also turn NBSP ( \xC2\xA0 ) into space?

  # DO split on "..." or "--" or "---"; common formatting error resulting in
  # hapaxes.  Keep the separator itself as a token, though, as long ones can
  # be good spamsigns.
  s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
  s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;

  if (IGNORE_TITLE_CASE) {
    if ($region == 1 || $region == 2) {
      # lower-case Title Case at start of a full-stop-delimited line (as would
      # be seen in a Western language).
      s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge;
    }
  }

  my $magic_re = $self->{store}->get_magic_re();

  # Note that split() in scope of 'use bytes' results in words with utf8 flag
  # cleared, even if the source string has perl characters semantics !!!
  # Is this really still desirable?

TOKEN: foreach my $token (split) {
    $token =~ s/^[-'"\.,]+//;        # trim non-alphanum chars at start or end
    $token =~ s/[-'"\.,]+$//;        # so we don't get loads of '"foo' tokens

    # Skip false magic tokens
    # TVD: we need to do a defined() check since SQL doesn't have magic
    # tokens, so the SQL BayesStore returns undef.  I really want a way
    # of optimizing that out, but I haven't come up with anything yet.
    #
    next if ( defined $magic_re && $token =~ /$magic_re/o );

    # *do* keep 3-byte tokens; there's some solid signs in there
    my $len = length($token);

    # but extend the stop-list. These are squarely in the gray
    # area, and it just slows us down to record them.
    # See http://wiki.apache.org/spamassassin/BayesStopList for more info.
    #
    next if $len < 3;

    # check stopwords regexp if not cached
    if (@{$conf->{bayes_stopword_languages}}) {
      if (!exists $self->{stopword_cache}{$token}) {
        foreach my $lang (@{$conf->{bayes_stopword_languages}}) {
          if ($token =~ $self->{bayes_stopword}{$lang}) {
            dbg("bayes: skipped token '$token' because it's in stopword list for language '$lang'");
            $self->{stopword_cache}{$token} = 1;
            next TOKEN;
          }
        }
        $self->{stopword_cache}{$token} = 0;
      } else {
        # bail out if cached known
        next if $self->{stopword_cache}{$token};
      }
    }

    # are we in the body?  If so, apply some body-specific breakouts
    if ($region == 1 || $region == 2) {
      if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
	push (@rettokens, $self->_tokenize_mail_addrs ($token));
      }
      elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
	push (@rettokens, "UD:".$token); # the full token
	my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
	  push (@rettokens, "UD:".$1); # UD = URL domain
	}
      }
    }

    # note: do not trim down overlong tokens if they contain '*'.  This is
    # used as part of split tokens such as "HTo:D*net" indicating that 
    # the domain ".net" appeared in the To header.
    #
    if ($len > $conf->{bayes_max_token_length} && index($token, '*') == -1) {

      if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
	# Bug 7135
	# collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences
	my(@t) = $token =~ /( (?: [\xE0-\xEF] | [\xF0-\xF4][\x80-\xBF] )
                              [\x80-\xBF]{2} )/xsg;
	if (@t) {
          push (@rettokens, map($tokprefix.'u8:'.$_, @t));
	  next;
	}
      }

      if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
	# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
	# but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
	# to me! (jm)
	while ($token =~ s/^(..?)//) {
	  push (@rettokens, $tokprefix.'8:'.$1);
	}
	next;
      }

      if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
            || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
            || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
      {
	# if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
	# Spambayes trick via Matt: Just retain 7 chars.  Do not retain the
	# length, it does not help; see jm's mail to -devel on Nov 20 2002 at
	# http://sourceforge.net/p/spamassassin/mailman/message/12977605/
	# "sk:" stands for "skip".
	# Bug 7141: retain seven UTF-8 chars (or other bytes),
	# if followed by at least two bytes
	$token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF]      |
	                         [\xC0-\xDF][\x80-\xBF]    |
	                         [\xE0-\xEF][\x80-\xBF]{2} |
	                         [\xF0-\xF4][\x80-\xBF]{3} | . ){7} ))
	             .{2,} \z }{sk:$1}xs;
	## (was:)  $token = "sk:".substr($token, 0, 7);  # seven bytes
      }
    }

    # decompose tokens?  do this after shortening long tokens
    if ($region == 1 || $region == 2) {
      if (DECOMPOSE_BODY_TOKENS) {
        if ($token =~ /[^\w:\*]/) {
          my $decompd = $token;                        # "Foo!"
          $decompd =~ s/[^\w:\*]//gs;
          push (@rettokens, $tokprefix.$decompd);      # "Foo"
        }

        if ($token =~ /[A-Z]/) {
          my $decompd = $token; $decompd = lc $decompd;
          push (@rettokens, $tokprefix.$decompd);      # "foo!"

          if ($token =~ /[^\w:\*]/) {
            $decompd =~ s/[^\w:\*]//gs;
            push (@rettokens, $tokprefix.$decompd);    # "foo"
          }
        }
      }
    }

    push (@rettokens, $tokprefix.$token);
  }

  return @rettokens;
}