in lib/Mail/SpamAssassin/Message.pm [97:388]
sub new {
my $class = shift;
$class = ref($class) || $class;
my($opts) = @_;
my $message = defined $opts->{'message'} ? $opts->{'message'} : \*STDIN;
my $parsenow = $opts->{'parsenow'} || 0;
my $normalize = $opts->{'normalize'} || 0;
# Specifies whether or not to parse message/rfc822 parts into its own tree.
# If the # > 0, it'll subparse, otherwise it won't. By default, do twenty
# levels deep.
my $subparse = defined $opts->{'subparse'} ? $opts->{'subparse'} : 20;
my $self = $class->SUPER::new({normalize=>$normalize});
$self->{tmpfiles} = [];
$self->{pristine_msg} = '';
$self->{pristine_headers} = '';
$self->{pristine_body} = '';
$self->{mime_boundary_state} = {};
$self->{line_ending} = "\012";
$self->{master_deadline} = $opts->{'master_deadline'};
$self->{suppl_attrib} = $opts->{'suppl_attrib'};
$self->{body_part_scan_size} = $opts->{'body_part_scan_size'} || 0;
$self->{rawbody_part_scan_size} = $opts->{'rawbody_part_scan_size'} || 0;
if ($self->{suppl_attrib}) { # caller-provided additional information
# pristine_body_length is currently used by an eval test check_body_length
# Possible To-Do: Base the length on the @message array later down?
if (defined $self->{suppl_attrib}{body_size}) {
# Optional info provided by a caller; should reflect the original
# message body size if provided, and as such it may differ from the
# $self->{pristine_body} size, e.g. when the caller passed a truncated
# message to SpamAssassin, or when counting line-endings differently.
$self->{pristine_body_length} = $self->{suppl_attrib}{body_size};
dbg("message: set pristine_body_length from suppl_attrib: %s", $self->{pristine_body_length});
}
if (ref $self->{suppl_attrib}{mimepart_digests}) {
# Optional info provided by a caller: an array of digest codes (e.g. SHA1)
# of each MIME part. Should reflect the original message if provided.
# As such it may differ from digests calculated by get_mimepart_digests(),
# e.g. when the caller passed a truncated message to SpamAssassin.
$self->{mimepart_digests} = $self->{suppl_attrib}{mimepart_digests};
dbg("message: set mimepart_digests from suppl_attrib");
}
}
bless($self,$class);
# create the metadata holder class
$self->{metadata} = Mail::SpamAssassin::Message::Metadata->new($self);
# Ok, go ahead and do the message "parsing"
# protect it from abuse ...
local $_;
# Figure out how the message was passed to us, and deal with it.
my @message;
if (ref $message eq 'ARRAY') {
@message = @{$message};
}
elsif (ref($message) eq 'GLOB' || index(ref($message), 'IO::') == 0) {
if (defined fileno $message) {
# sysread+split avoids a Perl I/O bug (Bug 5985)
# and is faster than (<$message>) by 10..25 %
# (a drawback is a short-term double storage of a text in $raw_str)
#
my($nread,$raw_str); $raw_str = '';
while ( $nread=sysread($message, $raw_str, 16384, length $raw_str) ) { }
defined $nread or die "error reading: $!";
@message = split(/^/m, $raw_str, -1);
if ($raw_str eq '') {
dbg("message: empty message read");
} elsif (length($raw_str) > 128*1024) {
# ditch rarely used large chunks of allocated memory, Bug 6514
# http://www.perlmonks.org/?node_id=803515
# about 97% of mail messages are below 128 kB,
# about 98% of mail messages are below 256 kB (2010 statistics)
# dbg("message: deallocating %.2f MB", length($raw_str)/1024/1024);
undef $raw_str;
}
}
}
elsif (ref $message eq 'SCALAR') {
@message = split(/^/m, $$message, -1);
}
elsif (ref $message) {
dbg("message: Input is a reference of unknown type!");
}
elsif (defined $message) {
@message = split(/^/m, $message, -1);
}
# Deal with null message
if (!@message) {
# bug 4884:
# if we get here, it means that the input was null, so fake the message
# content as a single newline...
@message = ("\n");
}
# Bug 7648:
# Make sure the message is tainted. When linting, @testmsg is not, so this
# handles that. Perhaps 3rd party tools could call this with untainted
# messages? Tainting the message is important because it prevents certain
# exploits later.
if (Mail::SpamAssassin::Util::am_running_in_taint_mode() &&
grep { !tainted($_) } @message) {
local($_);
# To preserve newlines, no joining and splitting here, process each line
# directly as is.
foreach (@message) {
$_ = Mail::SpamAssassin::Util::taint_var($_);
}
if (grep { !tainted($_) } @message) {
die "Mail::SpamAssassin::Message failed to enforce message taintness";
}
}
# Pull off mbox and mbx separators
if ($message[0] =~ /^From\s+(?!:)/) {
# careful not to confuse with obsolete syntax which allowed WSP before ':'
# mbox formatted mailbox
$self->{'mbox_sep'} = shift @message;
} elsif ($message[0] =~ MBX_SEPARATOR) {
$_ = shift @message;
# Munge the mbx message separator into mbox format as a sort of
# de facto portability standard in SA's internals. We need to
# to this so that Mail::SpamAssassin::Util::parse_rfc822_date
# can parse the date string...
if (/([\s\d]\d)-([a-zA-Z]{3})-(\d{4})\s(\d{2}):(\d{2}):(\d{2})/) {
# $1 = day of month
# $2 = month (text)
# $3 = year
# $4 = hour
# $5 = min
# $6 = sec
my @arr = localtime(timelocal($6,$5,$4,$1,$MONTH{lc($2)}-1,$3));
my $address;
foreach (@message) {
if (/^From:[^<]*<([^>]+)>/) {
$address = $1;
last;
} elsif (/^From:\s*([^<> ]+)/) {
$address = $1;
last;
}
}
$self->{'mbox_sep'} = "From $address $DAY_OF_WEEK[$arr[6]] $2 $1 $4:$5:$6 $3\n";
}
}
# bug 4363
# Check to see if we should do CRLF instead of just LF
# For now, just check the first and last line and do whatever it does
if (index($message[0], "\015\012") != -1 || index($message[-1], "\015\012") != -1) {
$self->{line_ending} = "\015\012";
dbg("message: line ending changed to CRLF");
}
# Is a CRLF -> LF line endings conversion necessary?
my $squash_crlf = $self->{line_ending} eq "\015\012";
# Go through all the header fields of the message
my $hdr_errors = 0;
my $header;
for (;;) {
# make sure not to lose the last header field when there is no body
my $eof = !@message;
my $current = $eof ? $self->{line_ending} : shift @message;
# Bug 7785: spamass-milter breaks wrapped headers, add any missing \r
if ($squash_crlf) {
$current =~ s/(?<!\015)\012/\015\012/gs;
}
if ( $current =~ /^[ \t]/ ) {
# This wasn't useful in terms of a rule, but we may want to treat it
# specially at some point. Perhaps ignore it?
#unless ($current =~ /\S/) {
# $self->{'obsolete_folding_whitespace'} = 1;
#}
$header = '' if !defined $header; # header starts with a continuation!?
$header .= $current; # append continuations, no matter what
$self->{'pristine_headers'} .= $current;
}
else { # not a continuation
# Ok, there's a header here, let's go ahead and add it in.
if (defined $header) { # deal with a previous header field
my ($key, $value) = split (/:/s, $header, 2);
# If it's not a valid header (aka: not in the form "foo:bar"), skip it.
if (defined $value) {
# CRLF -> LF line-endings conversion if necessary
$value =~ s/\015\012/\012/sg if $squash_crlf;
$key =~ s/[ \t]+\z//; # strip WSP before colon, obsolete rfc822 syn
# limit the length of the pairs we store
if (length($key) > MAX_HEADER_KEY_LENGTH) {
$key = substr($key, 0, MAX_HEADER_KEY_LENGTH);
$self->{'truncated_header'} = 1;
}
if (length($value) > MAX_HEADER_VALUE_LENGTH) {
$value = substr($value, 0, MAX_HEADER_VALUE_LENGTH);
$self->{'truncated_header'} = 1;
}
$self->header($key, $value);
}
}
if ($current eq $self->{line_ending}) { # a regular end of a header section
if ($eof) {
$self->{'missing_head_body_separator'} = 1;
} else {
$self->{'pristine_headers'} .= $current;
}
last;
}
elsif ($current =~ /^--/) { # mime boundary encountered, bail out
$self->{'missing_head_body_separator'} = 1;
unshift(@message, $current);
last;
}
# should we assume entering a body on encountering invalid header field?
else {
# no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
if ($current !~ /^[\041-\071\073-\176]+[ \t]*:/) {
# A field name MUST be composed of printable US-ASCII characters
# (i.e., characters that have values between 33 (041) and 126 (176),
# inclusive), except colon (072). Obsolete header field syntax
# allowed WSP before a colon.
if (++$hdr_errors <= 3) {
# just consume but ignore a few invalid header fields
} else { # enough is enough...
$self->{'missing_head_body_separator'} = 1;
unshift(@message, $current);
last;
}
}
}
# start collecting a new header field
$header = $current;
$self->{'pristine_headers'} .= $current;
}
}
undef $header;
# Store the pristine body for later -- store as a copy since @message
# will get modified below
$self->{'pristine_body'} = join('', @message);
if (!defined $self->{pristine_body_length}) {
$self->{'pristine_body_length'} = length $self->{'pristine_body'};
}
# Store complete message, get_pristine() is used a lot, avoid making copies
$self->{'pristine_msg'} = $self->{'pristine_headers'} . $self->{'pristine_body'};
# iterate over lines in reverse order
# merge multiple blank lines into a single one
my $start;
for (my $cnt=$#message; $cnt>=0; $cnt--) {
# CRLF -> LF line-endings conversion if necessary
$message[$cnt] =~ s/\015\012\z/\012/ if $squash_crlf;
# line is blank
if ($message[$cnt] =~ /^\s*$/) {
# /^\s*$/ is about 5% faster then !/\S/, but still expensive here
if (!defined $start) {
$start=$cnt;
}
next unless $cnt == 0;
}
# line is not blank, or we've reached the beginning
# if we've got a series of blank lines, get rid of them
if (defined $start) {
my $max_blank_lines = 20;
my $num = $start-$cnt;
if ($num > $max_blank_lines) {
splice @message, $cnt+2, $num-$max_blank_lines;
}
undef $start;
}
}