in lib/Mail/SpamAssassin/PerMsgStatus.pm [2692:2794]
sub _process_text_uri_list {
my ($self) = @_;
# Use decoded stripped body, which does not contain HTML
my $textary = $self->get_decoded_stripped_body_text_array();
my $tbirdurire = $self->_tbirdurire;
my %seen;
my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
foreach my $text (@$textary) {
# a workaround for [perl #69973] bug:
# Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
# Bug 6225, regexp and string should both be utf8, or none of them;
# untainting string also seems to avoid the crash
#
# Bug 6225: untaint the string in an attempt to work around a perl crash
local $_ = untaint_var($text);
local($1,$2,$3);
while (/$tbirdurire/igo) {
my $rawuri = $1||$2||$3;
my $schost = $4;
my $rawtype = defined $1 ? 'scheme' : defined $2 ? 'mail' : 'schemeless';
$rawuri =~ s/(^[^(]*)\).*$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceding it
$rawuri =~ s/[-~!@#^&*()_+=:;\'?,.]*$//; # remove trailing string of punctuations that TBird ignores
next if exists $seen{$rawuri};
$seen{$rawuri} = 1;
# Ignore bogus mail captures (@ might have been trimmed from the end above..)
next if $rawtype eq 'mail' && index($rawuri, '@') == -1;
dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
# Quick ignore if schemeless host not valid
next if defined $schost && !is_fqdn_valid($schost, 1);
# Ignore cid: mid: as they can be mistaken for emails,
# these should not be parsed from stripped body in any case.
# Example: [cid:image001.png@01D4986E.E3459640]
next if $rawuri =~ /^[cm]id:/i;
# Ignore empty uris
next if $rawuri =~ /^\w+:\/{0,2}$/i;
my $types = {parsed => 1};
# If it's a hostname that was just sitting out in the
# open, without a protocol, and not inside of an HTML tag,
# the we should add the proper protocol in front, rather
# than using the base URI.
my $uri = $rawuri;
if ($uri !~ /^(?:https?|ftp|mailto):/i) {
if ($uri =~ /^ftp\./i) {
$uri = "ftp://$uri";
}
elsif ($uri =~ /^www\d{0,2}\./i) {
$uri = "http://$uri";
}
elsif ($uri =~ /\/.+\@/) {
# if a "/" is found before @ it cannot be a valid email address
$uri = "http://$uri";
}
elsif (index($uri, '@') != -1) {
# This is not linkified by MUAs: foo@bar%2Ecom
# This IS linkified: foo@bar%2Ebar.com
# And this is linkified: foo@bar%2Ecom?foo.com&bar (woot??)
# And this is linkified with Outlook: foo@bar%2Ecom&foo (woot??)
# ...
# Skip if no dot found after @, tested without urldecoding,
# quick skip for crap like Vi@gra.
next unless $uri =~ /\@.+?\./;
next if index($uri, ' ') != -1; # ignore garbled
$uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
$uri = "mailto:$uri";
}
else {
# some spammers are using unschemed URIs to escape filters
# flag that this is a URI that MUAs don't linkify so only use for RBLs
# (TODO: why only use for RBLs?? why not uri rules? Use tflags to choose?)
next if index($uri, '.') == -1; # skip unless dot found, garbage
$uri = "http://$uri";
$types->{unlinked} = 1;
}
# Mark any of those schemeless
$types->{schemeless} = 1;
}
if ($uri =~ /^mailto:/i) {
# MUAs linkify and urldecode mailto:foo%40bar%2Fcom
$uri = Mail::SpamAssassin::Util::url_decode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
# Skip unless @ found after decoding, then check tld is valid
next unless $uri =~ /\@([^?&>]*)/;
my $host = $1; $host =~ s/(?:\%20)+$//; # strip trailing %20 from host
next unless $self->{main}->{registryboundaries}->is_domain_valid($host);
}
dbg("uri: parsed uri from text ($rawtype): $uri") if $would_log_uri_all;
$self->add_uri_detail_list($uri, $types, 'parsed', 1);
}
}
}