sub _process_text_uri_list()

in lib/Mail/SpamAssassin/PerMsgStatus.pm [2692:2794]


sub _process_text_uri_list {
  my ($self) = @_;

  # Use decoded stripped body, which does not contain HTML
  my $textary = $self->get_decoded_stripped_body_text_array();
  my $tbirdurire = $self->_tbirdurire;
  my %seen;
  my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache

  foreach my $text (@$textary) {
    # a workaround for [perl #69973] bug:
    # Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
    # Bug 6225, regexp and string should both be utf8, or none of them;
    # untainting string also seems to avoid the crash
    #
    # Bug 6225: untaint the string in an attempt to work around a perl crash
    local $_ = untaint_var($text);

    local($1,$2,$3);
    while (/$tbirdurire/igo) {
      my $rawuri = $1||$2||$3;
      my $schost = $4;
      my $rawtype = defined $1 ? 'scheme' : defined $2 ? 'mail' : 'schemeless';
      $rawuri =~ s/(^[^(]*)\).*$/$1/;  # as per ThunderBird, ) is an end delimiter if there is no ( preceding it
      $rawuri =~ s/[-~!@#^&*()_+=:;\'?,.]*$//; # remove trailing string of punctuations that TBird ignores

      next if exists $seen{$rawuri};
      $seen{$rawuri} = 1;

      # Ignore bogus mail captures (@ might have been trimmed from the end above..)
      next if $rawtype eq 'mail' && index($rawuri, '@') == -1;

      dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;

      # Quick ignore if schemeless host not valid
      next if defined $schost && !is_fqdn_valid($schost, 1);

      # Ignore cid: mid: as they can be mistaken for emails,
      # these should not be parsed from stripped body in any case.
      # Example: [cid:image001.png@01D4986E.E3459640]
      next if $rawuri =~ /^[cm]id:/i;

      # Ignore empty uris
      next if $rawuri =~ /^\w+:\/{0,2}$/i;

      my $types = {parsed => 1};

      # If it's a hostname that was just sitting out in the
      # open, without a protocol, and not inside of an HTML tag,
      # the we should add the proper protocol in front, rather
      # than using the base URI.
      my $uri = $rawuri;
      if ($uri !~ /^(?:https?|ftp|mailto):/i) {
        if ($uri =~ /^ftp\./i) {
          $uri = "ftp://$uri";
        }
        elsif ($uri =~ /^www\d{0,2}\./i) {
          $uri = "http://$uri";
        }
        elsif ($uri =~ /\/.+\@/) {
          # if a "/" is found before @ it cannot be a valid email address
          $uri = "http://$uri";
        }
        elsif (index($uri, '@') != -1) {
          # This is not linkified by MUAs: foo@bar%2Ecom
          # This IS linkified: foo@bar%2Ebar.com
          # And this is linkified: foo@bar%2Ecom?foo.com&bar  (woot??)
          # And this is linkified with Outlook: foo@bar%2Ecom&foo  (woot??)
          # ...
          # Skip if no dot found after @, tested without urldecoding,
          # quick skip for crap like Vi@gra.
          next unless $uri =~ /\@.+?\./;
          next if index($uri, ' ') != -1; # ignore garbled
          $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
          $uri = "mailto:$uri";
        }
        else {
          # some spammers are using unschemed URIs to escape filters
          # flag that this is a URI that MUAs don't linkify so only use for RBLs
          # (TODO: why only use for RBLs?? why not uri rules? Use tflags to choose?)
          next if index($uri, '.') == -1; # skip unless dot found, garbage
          $uri = "http://$uri";
          $types->{unlinked} = 1;
        }
        # Mark any of those schemeless
        $types->{schemeless} = 1;
      }

      if ($uri =~ /^mailto:/i) {
        # MUAs linkify and urldecode mailto:foo%40bar%2Fcom
        $uri = Mail::SpamAssassin::Util::url_decode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
        # Skip unless @ found after decoding, then check tld is valid
        next unless $uri =~ /\@([^?&>]*)/;
        my $host = $1; $host =~ s/(?:\%20)+$//; # strip trailing %20 from host
        next unless $self->{main}->{registryboundaries}->is_domain_valid($host);
      }

      dbg("uri: parsed uri from text ($rawtype): $uri") if $would_log_uri_all;

      $self->add_uri_detail_list($uri, $types, 'parsed', 1);
    }
  }
}