sub new()

in lib/Mail/SpamAssassin/Message.pm [97:388]
176 lines of code
48 McCabe index (conditional complexity)

sub new {
  my $class = shift;
  $class = ref($class) || $class;

  my($opts) = @_;
  my $message = defined $opts->{'message'} ? $opts->{'message'} : \*STDIN;
  my $parsenow = $opts->{'parsenow'} || 0;
  my $normalize = $opts->{'normalize'} || 0;

  # Specifies whether or not to parse message/rfc822 parts into its own tree.
  # If the # > 0, it'll subparse, otherwise it won't.  By default, do twenty
  # levels deep.
  my $subparse = defined $opts->{'subparse'} ? $opts->{'subparse'} : 20;

  my $self = $class->SUPER::new({normalize=>$normalize});

  $self->{tmpfiles} =           [];
  $self->{pristine_msg} = 	'';
  $self->{pristine_headers} =	'';
  $self->{pristine_body} =	'';
  $self->{mime_boundary_state} = {};
  $self->{line_ending} =	"\012";
  $self->{master_deadline} = $opts->{'master_deadline'};
  $self->{suppl_attrib} = $opts->{'suppl_attrib'};
  $self->{body_part_scan_size} = $opts->{'body_part_scan_size'} || 0;
  $self->{rawbody_part_scan_size} = $opts->{'rawbody_part_scan_size'} || 0;

  if ($self->{suppl_attrib}) {  # caller-provided additional information
    # pristine_body_length is currently used by an eval test check_body_length
    # Possible To-Do: Base the length on the @message array later down?
    if (defined $self->{suppl_attrib}{body_size}) {
      # Optional info provided by a caller; should reflect the original
      # message body size if provided, and as such it may differ from the
      # $self->{pristine_body} size, e.g. when the caller passed a truncated
      # message to SpamAssassin, or when counting line-endings differently.
      $self->{pristine_body_length} = $self->{suppl_attrib}{body_size};
      dbg("message: set pristine_body_length from suppl_attrib: %s", $self->{pristine_body_length});
    }
    if (ref $self->{suppl_attrib}{mimepart_digests}) {
      # Optional info provided by a caller: an array of digest codes (e.g. SHA1)
      # of each MIME part. Should reflect the original message if provided.
      # As such it may differ from digests calculated by get_mimepart_digests(),
      # e.g. when the caller passed a truncated message to SpamAssassin.
      $self->{mimepart_digests} = $self->{suppl_attrib}{mimepart_digests};
      dbg("message: set mimepart_digests from suppl_attrib");
    }
  }

  bless($self,$class);

  # create the metadata holder class
  $self->{metadata} = Mail::SpamAssassin::Message::Metadata->new($self);

  # Ok, go ahead and do the message "parsing"

  # protect it from abuse ...
  local $_;

  # Figure out how the message was passed to us, and deal with it.
  my @message;
  if (ref $message eq 'ARRAY') {
     @message = @{$message};
  }
  elsif (ref($message) eq 'GLOB' || index(ref($message), 'IO::') == 0) {
    if (defined fileno $message) {

      # sysread+split avoids a Perl I/O bug (Bug 5985)
      # and is faster than (<$message>) by 10..25 %
      # (a drawback is a short-term double storage of a text in $raw_str)
      #
      my($nread,$raw_str); $raw_str = '';
      while ( $nread=sysread($message, $raw_str, 16384, length $raw_str) ) { }
      defined $nread  or die "error reading: $!";
      @message = split(/^/m, $raw_str, -1);

      if ($raw_str eq '') {
        dbg("message: empty message read");
      } elsif (length($raw_str) > 128*1024) {
        # ditch rarely used large chunks of allocated memory, Bug 6514
        #   http://www.perlmonks.org/?node_id=803515
        # about 97% of mail messages are below 128 kB,
        # about 98% of mail messages are below 256 kB (2010 statistics)
        # dbg("message: deallocating %.2f MB", length($raw_str)/1024/1024);
        undef $raw_str;
      }
    }
  }
  elsif (ref $message eq 'SCALAR') {
    @message = split(/^/m, $$message, -1);
  }
  elsif (ref $message) {
    dbg("message: Input is a reference of unknown type!");
  }
  elsif (defined $message) {
    @message = split(/^/m, $message, -1);
  }

  # Deal with null message
  if (!@message) {
    # bug 4884:
    # if we get here, it means that the input was null, so fake the message
    # content as a single newline...
    @message = ("\n");
  }

  # Bug 7648:
  # Make sure the message is tainted. When linting, @testmsg is not, so this
  # handles that. Perhaps 3rd party tools could call this with untainted
  # messages? Tainting the message is important because it prevents certain
  # exploits later.
  if (Mail::SpamAssassin::Util::am_running_in_taint_mode() &&
        grep { !tainted($_) } @message) {
    local($_);
    # To preserve newlines, no joining and splitting here, process each line
    # directly as is.
    foreach (@message) {
      $_ = Mail::SpamAssassin::Util::taint_var($_);
    }
    if (grep { !tainted($_) } @message) {
      die "Mail::SpamAssassin::Message failed to enforce message taintness";
    }
  }

  # Pull off mbox and mbx separators
  if ($message[0] =~ /^From\s+(?!:)/) {
    # careful not to confuse with obsolete syntax which allowed WSP before ':'
    # mbox formatted mailbox
    $self->{'mbox_sep'} = shift @message;
  } elsif ($message[0] =~ MBX_SEPARATOR) {
    $_ = shift @message;

    # Munge the mbx message separator into mbox format as a sort of
    # de facto portability standard in SA's internals.  We need to
    # to this so that Mail::SpamAssassin::Util::parse_rfc822_date
    # can parse the date string...
    if (/([\s\d]\d)-([a-zA-Z]{3})-(\d{4})\s(\d{2}):(\d{2}):(\d{2})/) {
      # $1 = day of month
      # $2 = month (text)
      # $3 = year
      # $4 = hour
      # $5 = min
      # $6 = sec
      my @arr = localtime(timelocal($6,$5,$4,$1,$MONTH{lc($2)}-1,$3));
      my $address;
      foreach (@message) {
  	if (/^From:[^<]*<([^>]+)>/) {
  	    $address = $1;
  	    last;
  	} elsif (/^From:\s*([^<> ]+)/) {
  	    $address = $1;
  	    last;
  	}
      }
      $self->{'mbox_sep'} = "From $address $DAY_OF_WEEK[$arr[6]] $2 $1 $4:$5:$6 $3\n";
    }
  }

  # bug 4363
  # Check to see if we should do CRLF instead of just LF
  # For now, just check the first and last line and do whatever it does
  if (index($message[0], "\015\012") != -1 || index($message[-1], "\015\012") != -1) {
    $self->{line_ending} = "\015\012";
    dbg("message: line ending changed to CRLF");
  }

  # Is a CRLF -> LF line endings conversion necessary?
  my $squash_crlf = $self->{line_ending} eq "\015\012";

  # Go through all the header fields of the message
  my $hdr_errors = 0;
  my $header;
  for (;;) {
    # make sure not to lose the last header field when there is no body
    my $eof = !@message;
    my $current = $eof ? $self->{line_ending} : shift @message;

    # Bug 7785: spamass-milter breaks wrapped headers, add any missing \r
    if ($squash_crlf) {
      $current =~ s/(?<!\015)\012/\015\012/gs;
    }

    if ( $current =~ /^[ \t]/ ) {
      # This wasn't useful in terms of a rule, but we may want to treat it
      # specially at some point.  Perhaps ignore it?
      #unless ($current =~ /\S/) {
      #  $self->{'obsolete_folding_whitespace'} = 1;
      #}

      $header = ''  if !defined $header;  # header starts with a continuation!?
      $header .= $current;  # append continuations, no matter what
      $self->{'pristine_headers'} .= $current;
    }
    else {  # not a continuation
      # Ok, there's a header here, let's go ahead and add it in.
      if (defined $header) {  # deal with a previous header field
        my ($key, $value) = split (/:/s, $header, 2);

        # If it's not a valid header (aka: not in the form "foo:bar"), skip it.
        if (defined $value) {
	  # CRLF -> LF line-endings conversion if necessary
	  $value =~ s/\015\012/\012/sg  if $squash_crlf;
	  $key =~ s/[ \t]+\z//;  # strip WSP before colon, obsolete rfc822 syn
	  # limit the length of the pairs we store
	  if (length($key) > MAX_HEADER_KEY_LENGTH) {
	    $key = substr($key, 0, MAX_HEADER_KEY_LENGTH);
	    $self->{'truncated_header'} = 1;
	  }
	  if (length($value) > MAX_HEADER_VALUE_LENGTH) {
	    $value = substr($value, 0, MAX_HEADER_VALUE_LENGTH);
	    $self->{'truncated_header'} = 1;
	  }
          $self->header($key, $value);
        }
      }

      if ($current eq $self->{line_ending}) {  # a regular end of a header section
	if ($eof) {
	  $self->{'missing_head_body_separator'} = 1;
	} else {
	  $self->{'pristine_headers'} .= $current;
	}
	last;
      }
      elsif ($current =~ /^--/) {  # mime boundary encountered, bail out
	$self->{'missing_head_body_separator'} = 1;
	unshift(@message, $current);
 	last;
      }
      # should we assume entering a body on encountering invalid header field?
      else {
        # no re "strict";  # since perl 5.21.8: Ranges of ASCII printables...
        if ($current !~ /^[\041-\071\073-\176]+[ \t]*:/) {
	  # A field name MUST be composed of printable US-ASCII characters
	  # (i.e., characters that have values between 33 (041) and 126 (176),
	  # inclusive), except colon (072). Obsolete header field syntax
	  # allowed WSP before a colon.
	  if (++$hdr_errors <= 3) {
	    # just consume but ignore a few invalid header fields
	  } else {  # enough is enough...
	    $self->{'missing_head_body_separator'} = 1;
	    unshift(@message, $current);
 	    last;
	  }
	}
      }

      # start collecting a new header field
      $header = $current;
      $self->{'pristine_headers'} .= $current;
    }
  }
  undef $header;

  # Store the pristine body for later -- store as a copy since @message
  # will get modified below
  $self->{'pristine_body'} = join('', @message);

  if (!defined $self->{pristine_body_length}) {
    $self->{'pristine_body_length'} = length $self->{'pristine_body'};
  }

  # Store complete message, get_pristine() is used a lot, avoid making copies
  $self->{'pristine_msg'} = $self->{'pristine_headers'} . $self->{'pristine_body'};

  # iterate over lines in reverse order
  # merge multiple blank lines into a single one
  my $start;
  for (my $cnt=$#message; $cnt>=0; $cnt--) {
    # CRLF -> LF line-endings conversion if necessary
    $message[$cnt] =~ s/\015\012\z/\012/  if $squash_crlf;

    # line is blank
    if ($message[$cnt] =~ /^\s*$/) {
      # /^\s*$/ is about 5% faster then !/\S/, but still expensive here
      if (!defined $start) {
        $start=$cnt;
      }
      next unless $cnt == 0;
    }

    # line is not blank, or we've reached the beginning

    # if we've got a series of blank lines, get rid of them
    if (defined $start) {
      my $max_blank_lines = 20;
      my $num = $start-$cnt;
      if ($num > $max_blank_lines) {
        splice @message, $cnt+2, $num-$max_blank_lines;
      }
      undef $start;
    }
  }