sub _check_attachments()

in lib/Mail/SpamAssassin/Plugin/MIMEEval.pm [300:505]


sub _check_attachments {
  my ($self, $pms) = @_;

  # MIME status
  my $where = -1;		# -1 = start, 0 = nowhere, 1 = header, 2 = body
  my $qp_bytes = 0;		# total bytes in QP regions
  my $qp_count = 0;		# QP-encoded bytes in QP regions
  my @part_bytes;		# MIME part total bytes
  my @part_type;		# MIME part types

  my $normal_chars = 0;		# MIME text bytes that aren't encoded
  my $unicode_chars = 0;	# MIME text bytes that are unicode entities

  # MIME header information
  my $part = -1;		# MIME part index

  # indicate the scan has taken place
  $pms->{mime_checked_attachments} = 1;

  # results
# $pms->{mime_base64_blanks} = 0;  # expensive to determine, no longer avail
  $pms->{mime_base64_count} = 0;
  $pms->{mime_base64_encoded_text} = 0;
  # $pms->{mime_base64_illegal} = 0;
  # $pms->{mime_base64_latin} = 0;
  $pms->{mime_body_html_count} = 0;
  $pms->{mime_body_text_count} = 0;
  $pms->{mime_faraway_charset} = 0;
  # $pms->{mime_html_no_charset} = 0;
  $pms->{mime_missing_boundary} = 0;
  $pms->{mime_multipart_alternative} = 0;
  $pms->{mime_multipart_ratio} = 1.0;
  $pms->{mime_qp_count} = 0;
  # $pms->{mime_qp_illegal} = 0;
  # $pms->{mime_qp_inline_no_charset} = 0;
  $pms->{mime_qp_long_line} = 0;
  $pms->{mime_qp_ratio} = 0;
  $pms->{mime_ascii_text_illegal} = 0;
  $pms->{mime_text_unicode_ratio} = 0;

  # Get all parts ...
  foreach my $p ($pms->{msg}->find_parts(qr/./)) {
    # message headers
    my ($ctype, $boundary, $charset, $name) = Mail::SpamAssassin::Util::parse_content_type($p->get_header("content-type"));

    if ($ctype eq 'multipart/alternative') {
      $pms->{mime_multipart_alternative} = 1;
    }

    my $cte = $p->get_header('Content-Transfer-Encoding') || '';
    chomp($cte = defined($cte) ? lc $cte : "");

    my $cd = $p->get_header('Content-Disposition') || '';
    chomp($cd = defined($cd) ? lc $cd : "");

    $charset = lc $charset if ($charset);
    $name = lc $name if ($name);

    $self->_check_mime_header($pms, $ctype, $cte, $cd, $charset, $name);

    # If we're not in a leaf node in the tree, there will be no raw
    # section, so skip it.
    if (! $p->is_leaf()) {
      next;
    }

    $part++;
    $part_type[$part] = $ctype;
    $part_bytes[$part] = 0 if index($cd, 'attachment') == -1;

    my $cte_is_base64 = index($cte, 'base64') >= 0;
    my $previous = '';
    foreach (@{$p->raw()}) {

    # if ($cte_is_base64) {
    #   if ($previous =~ /^\s*$/ && /^\s*$/) {  # expensive, avoid!
    #     $pms->{mime_base64_blanks} = 1;  # never used, don't bother
    #   }
    #   # MIME_BASE64_ILLEGAL: now a zero-hitter
    #   # if (m@[^A-Za-z0-9+/=\n]@ || /=[^=\s]/) {
    #   # $pms->{mime_base64_illegal} = 1;
    #   # }
    # }

      # if ($pms->{mime_html_no_charset} && $ctype eq 'text/html' && defined $charset) {
      # $pms->{mime_html_no_charset} = 0;
      # }
      if ($pms->{mime_multipart_alternative} && index($cd, 'attachment') == -1 &&
          ($ctype eq 'text/plain' || $ctype eq 'text/html')) {
	$part_bytes[$part] += length;
      }

      if ($where != 1 && $cte eq "quoted-printable" && index($_, 'SPAM: ') != 0) {
        # RFC 5322: Each line SHOULD be no more than 78 characters,
        #           excluding the CRLF.
        # RFC 2045: The Quoted-Printable encoding REQUIRES that
        #           encoded lines be no more than 76 characters long.
        # Bug 5491: 6% of email classified as HAM by SA triggered the
        #           MIME_QP_LONG_LINE rule. Apple Mail can generate a QP-line
        #           that is 2 chars too long. Same goes for Outlook Web Access.
        # lines include one trailing \n character
      # if (length > 76+1) {  # conforms to RFC 5322 and RFC 2045
        if (length > 78+1) {  # conforms to RFC 5322 only, not RFC 2045
	  $pms->{mime_qp_long_line} = 1;
        }
        $qp_bytes += length;

        # MIME_QP_DEFICIENT: zero-hitter now

        # check for illegal substrings (RFC 2045), hexadecimal values 7F-FF and
        # control characters other than TAB, or CR and LF as parts of CRLF pairs
        # if (!$pms->{mime_qp_illegal} && /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]/)
        # {
        # $pms->{mime_qp_illegal} = 1;
        # }

        # count excessive QP bytes
        if (index($_, '=') >= 0) {
## no critic (Perlsecret)
	  # whoever wrote this next line is an evil hacker -- jm
	  my $qp = () = m/=(?:09|3[0-9ABCEF]|[2456][0-9A-F]|7[0-9A-E])/g;
## use critic
	  if ($qp) {
	    $qp_count += $qp;
	    # tabs and spaces at end of encoded line are okay.  Also, multiple
	    # whitespace at the end of a line are OK, like ">=20=20=20=20=20=20".
	    my ($trailing) = m/((?:=09|=20)+)\s*$/g;
	    if ($trailing) {
	      $qp_count -= (length($trailing) / 3);
	    }
	  }
        }
      }

      # if our charset is ASCII, this should only contain 7-bit characters
      # except NUL or a free-standing CR. anything else is a violation of
      # the definition of charset="us-ascii".
      if ($ctype eq 'text/plain' && (!defined $charset || $charset eq 'us-ascii')) {
        # no re "strict";  # since perl 5.21.8: Ranges of ASCII printables...
        if (m/[\x00\x0d\x80-\xff]+/) {
          if (would_log('dbg', 'eval')) {
            my $str = $_;
            $str =~ s/([\x00\x0d\x80-\xff]+)/'<' . unpack('H*', $1) . '>'/eg;
            dbg("check: ascii_text_illegal: matches " . $str . "\n");
          }
          $pms->{mime_ascii_text_illegal}++;
        }
      }

      # if we're text/plain, we should never see unicode escapes in this
      # format, especially not for 7bit or 8bit.
      if ($ctype eq 'text/plain' && ($cte eq '' || $cte eq '7bit' || $cte eq '8bit')) {
        my ($text, $subs) = $_;

        $subs = $text =~ s/&#x[0-9A-F]{4};//g;
        $normal_chars += length($text);
        $unicode_chars += $subs;

        if ($subs && would_log('dbg', 'eval')) {
          my $str = $_;
          $str = substr($str, 0, 512) . '...' if (length($str) > 512);
          dbg("check: abundant_unicode: " . $str . " (" . $subs . ")\n");
        }
      }

      $previous = $_;
    }
  }

  if ($qp_bytes) {
    $pms->{mime_qp_ratio} = $qp_count / $qp_bytes;
    $pms->{mime_qp_count} = $qp_count;
    $pms->{mime_qp_bytes} = $qp_bytes;
  }

  if ($normal_chars) {
    $pms->{mime_text_unicode_ratio} = $unicode_chars / $normal_chars;
  }

  if ($pms->{mime_multipart_alternative}) {
    my $text;
    my $html;
    # bug 4207: we want the size of the last parts
    for (my $i = $part; $i >= 0; $i--) {
      next if !defined $part_bytes[$i];
      if (!defined($html) && $part_type[$i] eq 'text/html') {
	$html = $part_bytes[$i];
      }
      elsif (!defined($text) && $part_type[$i] eq 'text/plain') {
	$text = $part_bytes[$i];
      }
      last if (defined($html) && defined($text));
    }
    if (defined($text) && defined($html) && $html > 0) {
      $pms->{mime_multipart_ratio} = ($text / $html);
    }
  }

  # Look to see if any multipart boundaries are not "balanced"
  foreach my $val (values %{$pms->{msg}->{mime_boundary_state}}) {
    if ($val != 0) {
      $pms->{mime_missing_boundary} = 1;
      last;
    }
  }
}