sub _parse_multipart()

in lib/Mail/SpamAssassin/Message.pm [934:1157]


sub _parse_multipart {
  my($self, $toparse) = @_;

  my ($msg, $boundary, $body, $subparse) = @{$toparse};
  my $nested_boundary = 0;

  # we're not supposed to be a leaf, so prep ourselves
  $msg->{'body_parts'} = [];

  # the next set of objects will be one level deeper
  $subparse--;

  dbg("message: parsing multipart, got boundary: ".(defined $boundary ? $boundary : ''));

  # NOTE: The MIME boundary REs here are very specific to be mostly RFC 1521
  # compliant, but also allow possible malformations to still work.  Please
  # see Bugzilla bug 3749 for more information before making any changes!

  # ignore preamble per RFC 1521, unless there's no boundary ...
  if ( defined $boundary ) {
    my $line;
    my $tmp_line = @{$body};
    for ($line=0; $line < $tmp_line; $line++) {
#     dbg("message: multipart line $line: \"" . $body->[$line] . "\"");
      # specifically look for an opening boundary
      if (substr($body->[$line],0,2) eq '--'  # triage
          && $body->[$line] =~ /^--\Q$boundary\E\s*$/) {
	# Make note that we found the opening boundary
	$self->{mime_boundary_state}->{$boundary} = 1;

	# if the line after the opening boundary isn't a header, flag it.
	# we need to make sure that there's actually another line though.
	# no re "strict";  # since perl 5.21.8: Ranges of ASCII printables...
	if ($line+1 < $tmp_line && $body->[$line+1] !~ /^[\041-\071\073-\176]+:/) {
	  $self->{'missing_mime_headers'} = 1;
	}

        last;
      }
    }

    # Found a boundary, ignore the preamble
    if ( $line < $tmp_line ) {
      splice @{$body}, 0, $line+1;
    }

    # Else, there's no boundary, so leave the whole part...
  }

  # prepare a new tree node
  my $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
  my $in_body = 0;
  my $header;
  my $part_array;
  my $found_end_boundary;
  my $found_last_end_boundary;
  my $partcnt = 0;

  my $line_count = @{$body};
  foreach ( @{$body} ) {
    # if we're on the last body line, or we find any boundary marker,
    # deal with the mime part;
    # a triage before an unlikely-to-match regexp avoids a CPU hotspot
    $found_end_boundary = defined $boundary && substr($_,0,2) eq '--'
                          && /^--\Q$boundary\E(--)?\s*$/;
    $found_last_end_boundary = $found_end_boundary && $1;
    if ($found_end_boundary && $nested_boundary) {
      $found_end_boundary = 0;
      $nested_boundary = 0 if ($found_last_end_boundary); # bug 7358 - handle one level of non-unique boundary string
    }
    if ( --$line_count == 0 || $found_end_boundary ) {
      my $line = $_; # remember the last line

      # If at last line and no end boundary found, the line belongs to body
      # TODO:
      #  Is $self->{mime_boundary_state}->{$boundary}-- needed here?
      #  Could "missing end boundary" be a useful rule? Mark it somewhere?
      #  If SA processed truncated message from amavis etc, this could also
      #  be hit legimately..
      if (!$found_end_boundary) {
        # TODO: This is duplicate code from few pages down below..
        while (length ($_) > MAX_BODY_LINE_LENGTH) {
          push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
          substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
        }
        push ( @{$part_array}, $_ );
      }
      # per rfc 1521, the CRLF before the boundary is part of the boundary:
      # NOTE: The CRLF preceding the encapsulation line is conceptually
      # attached to the boundary so that it is possible to have a part
      # that does not end with a CRLF (line break). Body parts that must
      # be considered to end with line breaks, therefore, must have two
      # CRLFs preceding the encapsulation line, the first of which is part
      # of the preceding body part, and the second of which is part of the
      # encapsulation boundary.
      elsif ($part_array) {
        chomp( $part_array->[-1] );  # trim the CRLF that's part of the boundary
        splice @{$part_array}, -1 if ( $part_array->[-1] eq '' ); # blank line for the boundary only ...
      }
      else {
        # Invalid parts can have no body, so fake in a blank body
	# in that case.
        $part_array = [];
      }

      ($part_msg->{'type'}, my $p_boundary, undef, undef, my $ct_was_missing) =
          Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));

      # bug 5741: if ct was missing and parent == multipart/digest, then
      # type should be set as message/rfc822
      if ($ct_was_missing) {
        if ($msg->{'type'} eq 'multipart/digest') {
          dbg("message: missing type, setting multipart/digest child as message/rfc822");
          $part_msg->{'type'} = 'message/rfc822';
        } else {
          dbg("message: missing type, setting as default text/plain");
        }
      }

      $p_boundary ||= $boundary;
      dbg("message: found part of type ".$part_msg->{'type'}.", boundary: ".(defined $p_boundary ? $p_boundary : ''));

      # we've created a new node object, so add it to the queue along with the
      # text that belongs to that part, then add the new part to the current
      # node to create the tree.
      push(@{$self->{'parse_queue'}}, [ $part_msg, $p_boundary, $part_array, $subparse ]);
      $msg->add_body_part($part_msg);

      if (defined $boundary) {
        if ($found_last_end_boundary) {
	  # Make a note that we've seen the end boundary
	  $self->{mime_boundary_state}->{$boundary}--;
          last;
        }
	elsif ($line_count && $body->[-$line_count] !~ /^[\041-\071\073-\176]+:/) {
          # if we aren't on an end boundary and there are still lines left, it
	  # means we hit a new start boundary.  therefore, the next line ought
	  # to be a mime header.  if it's not, mark it.
	  $self->{'missing_mime_headers'} = 1;
	}
      }

      # Maximum parts to process, simply skip the rest of the parts
      if (++$partcnt == 1000) {
        dbg("message: mimepart limit exceeded, stopping parsing");
        $self->{'mimepart_limit_exceeded'} = 1;
        return;
      }

      # make sure we start with a new clean node
      $in_body  = 0;
      $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
      undef $part_array;
      undef $header;

      next;
    }

    if (!$in_body) {
      # s/\s+$//;   # bug 5127: don't clean this up (yet)
      # no re "strict";  # since perl 5.21.8: Ranges of ASCII printables...
      if (/^[\041-\071\073-\176]+[ \t]*:/) {
        if ($header) {
          my ( $key, $value ) = split ( /:\s*/, $header, 2 );
          $part_msg->header( $key, $value );
        }
        $header = $_;
	next;
      }
      elsif (/^[ \t]/ && $header) {
        # $_ =~ s/^\s*//;   # bug 5127, again
        $header .= $_;
	next;
      }
      else {
        if ($header) {
          my ( $key, $value ) = split ( /:\s*/, $header, 2 );
          $part_msg->header( $key, $value );
          if (defined $boundary && lc $key eq 'content-type') {
	    my (undef, $nested_bound) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
            if (defined $nested_bound && $nested_bound eq $boundary) {
       	      $nested_boundary = 1;
            }
          }
        }
        $in_body = 1;

	# if there's a blank line separator, that's good.  if there isn't,
	# it's a body line, so drop through.
	if (/^\r?$/) {
	  next;
	}
	else {
          $self->{'missing_mime_head_body_separator'} = 1;
	}
      }
    }

    # we run into a perl bug if the lines are astronomically long (probably
    # due to lots of regexp backtracking); so split any individual line
    # over MAX_BODY_LINE_LENGTH bytes in length.  This can wreck HTML
    # totally -- but IMHO the only reason a luser would use
    # MAX_BODY_LINE_LENGTH-byte lines is to crash filters, anyway.
    while (length ($_) > MAX_BODY_LINE_LENGTH) {
      push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
      substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
    }
    push ( @{$part_array}, $_ );
  }

  # Look for a message epilogue
  # originally ignored whitespace:   0.185   0.2037   0.0654    0.757   0.00   0.00  TVD_TAB
  # ham FPs were all "." on a line by itself.
  # spams seem to only have NULL chars afterwards ?
  if ($line_count) {
    for(; $line_count > 0; $line_count--) {
      if ($body->[-$line_count] =~ /[^\s.]/) {
        $self->{mime_epilogue_exists} = 1;
        last;
      }
    }
  }

}