in lib/Mail/SpamAssassin/Message.pm [934:1157]
sub _parse_multipart {
my($self, $toparse) = @_;
my ($msg, $boundary, $body, $subparse) = @{$toparse};
my $nested_boundary = 0;
# we're not supposed to be a leaf, so prep ourselves
$msg->{'body_parts'} = [];
# the next set of objects will be one level deeper
$subparse--;
dbg("message: parsing multipart, got boundary: ".(defined $boundary ? $boundary : ''));
# NOTE: The MIME boundary REs here are very specific to be mostly RFC 1521
# compliant, but also allow possible malformations to still work. Please
# see Bugzilla bug 3749 for more information before making any changes!
# ignore preamble per RFC 1521, unless there's no boundary ...
if ( defined $boundary ) {
my $line;
my $tmp_line = @{$body};
for ($line=0; $line < $tmp_line; $line++) {
# dbg("message: multipart line $line: \"" . $body->[$line] . "\"");
# specifically look for an opening boundary
if (substr($body->[$line],0,2) eq '--' # triage
&& $body->[$line] =~ /^--\Q$boundary\E\s*$/) {
# Make note that we found the opening boundary
$self->{mime_boundary_state}->{$boundary} = 1;
# if the line after the opening boundary isn't a header, flag it.
# we need to make sure that there's actually another line though.
# no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
if ($line+1 < $tmp_line && $body->[$line+1] !~ /^[\041-\071\073-\176]+:/) {
$self->{'missing_mime_headers'} = 1;
}
last;
}
}
# Found a boundary, ignore the preamble
if ( $line < $tmp_line ) {
splice @{$body}, 0, $line+1;
}
# Else, there's no boundary, so leave the whole part...
}
# prepare a new tree node
my $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
my $in_body = 0;
my $header;
my $part_array;
my $found_end_boundary;
my $found_last_end_boundary;
my $partcnt = 0;
my $line_count = @{$body};
foreach ( @{$body} ) {
# if we're on the last body line, or we find any boundary marker,
# deal with the mime part;
# a triage before an unlikely-to-match regexp avoids a CPU hotspot
$found_end_boundary = defined $boundary && substr($_,0,2) eq '--'
&& /^--\Q$boundary\E(--)?\s*$/;
$found_last_end_boundary = $found_end_boundary && $1;
if ($found_end_boundary && $nested_boundary) {
$found_end_boundary = 0;
$nested_boundary = 0 if ($found_last_end_boundary); # bug 7358 - handle one level of non-unique boundary string
}
if ( --$line_count == 0 || $found_end_boundary ) {
my $line = $_; # remember the last line
# If at last line and no end boundary found, the line belongs to body
# TODO:
# Is $self->{mime_boundary_state}->{$boundary}-- needed here?
# Could "missing end boundary" be a useful rule? Mark it somewhere?
# If SA processed truncated message from amavis etc, this could also
# be hit legimately..
if (!$found_end_boundary) {
# TODO: This is duplicate code from few pages down below..
while (length ($_) > MAX_BODY_LINE_LENGTH) {
push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
}
push ( @{$part_array}, $_ );
}
# per rfc 1521, the CRLF before the boundary is part of the boundary:
# NOTE: The CRLF preceding the encapsulation line is conceptually
# attached to the boundary so that it is possible to have a part
# that does not end with a CRLF (line break). Body parts that must
# be considered to end with line breaks, therefore, must have two
# CRLFs preceding the encapsulation line, the first of which is part
# of the preceding body part, and the second of which is part of the
# encapsulation boundary.
elsif ($part_array) {
chomp( $part_array->[-1] ); # trim the CRLF that's part of the boundary
splice @{$part_array}, -1 if ( $part_array->[-1] eq '' ); # blank line for the boundary only ...
}
else {
# Invalid parts can have no body, so fake in a blank body
# in that case.
$part_array = [];
}
($part_msg->{'type'}, my $p_boundary, undef, undef, my $ct_was_missing) =
Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
# bug 5741: if ct was missing and parent == multipart/digest, then
# type should be set as message/rfc822
if ($ct_was_missing) {
if ($msg->{'type'} eq 'multipart/digest') {
dbg("message: missing type, setting multipart/digest child as message/rfc822");
$part_msg->{'type'} = 'message/rfc822';
} else {
dbg("message: missing type, setting as default text/plain");
}
}
$p_boundary ||= $boundary;
dbg("message: found part of type ".$part_msg->{'type'}.", boundary: ".(defined $p_boundary ? $p_boundary : ''));
# we've created a new node object, so add it to the queue along with the
# text that belongs to that part, then add the new part to the current
# node to create the tree.
push(@{$self->{'parse_queue'}}, [ $part_msg, $p_boundary, $part_array, $subparse ]);
$msg->add_body_part($part_msg);
if (defined $boundary) {
if ($found_last_end_boundary) {
# Make a note that we've seen the end boundary
$self->{mime_boundary_state}->{$boundary}--;
last;
}
elsif ($line_count && $body->[-$line_count] !~ /^[\041-\071\073-\176]+:/) {
# if we aren't on an end boundary and there are still lines left, it
# means we hit a new start boundary. therefore, the next line ought
# to be a mime header. if it's not, mark it.
$self->{'missing_mime_headers'} = 1;
}
}
# Maximum parts to process, simply skip the rest of the parts
if (++$partcnt == 1000) {
dbg("message: mimepart limit exceeded, stopping parsing");
$self->{'mimepart_limit_exceeded'} = 1;
return;
}
# make sure we start with a new clean node
$in_body = 0;
$part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
undef $part_array;
undef $header;
next;
}
if (!$in_body) {
# s/\s+$//; # bug 5127: don't clean this up (yet)
# no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
if (/^[\041-\071\073-\176]+[ \t]*:/) {
if ($header) {
my ( $key, $value ) = split ( /:\s*/, $header, 2 );
$part_msg->header( $key, $value );
}
$header = $_;
next;
}
elsif (/^[ \t]/ && $header) {
# $_ =~ s/^\s*//; # bug 5127, again
$header .= $_;
next;
}
else {
if ($header) {
my ( $key, $value ) = split ( /:\s*/, $header, 2 );
$part_msg->header( $key, $value );
if (defined $boundary && lc $key eq 'content-type') {
my (undef, $nested_bound) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
if (defined $nested_bound && $nested_bound eq $boundary) {
$nested_boundary = 1;
}
}
}
$in_body = 1;
# if there's a blank line separator, that's good. if there isn't,
# it's a body line, so drop through.
if (/^\r?$/) {
next;
}
else {
$self->{'missing_mime_head_body_separator'} = 1;
}
}
}
# we run into a perl bug if the lines are astronomically long (probably
# due to lots of regexp backtracking); so split any individual line
# over MAX_BODY_LINE_LENGTH bytes in length. This can wreck HTML
# totally -- but IMHO the only reason a luser would use
# MAX_BODY_LINE_LENGTH-byte lines is to crash filters, anyway.
while (length ($_) > MAX_BODY_LINE_LENGTH) {
push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
}
push ( @{$part_array}, $_ );
}
# Look for a message epilogue
# originally ignored whitespace: 0.185 0.2037 0.0654 0.757 0.00 0.00 TVD_TAB
# ham FPs were all "." on a line by itself.
# spams seem to only have NULL chars afterwards ?
if ($line_count) {
for(; $line_count > 0; $line_count--) {
if ($body->[-$line_count] =~ /[^\s.]/) {
$self->{mime_epilogue_exists} = 1;
last;
}
}
}
}