sub _get_pdf_details()

in lib/Mail/SpamAssassin/Plugin/PDFInfo.pm [261:480]


# _get_pdf_details($pms, $part)
#
# Scans the decoded content of one PDF MIME part line by line and records
# what it finds on $pms->{pdfinfo} and, through _set_tag(), as template tags.
#
# Arguments:
#   $pms  - per-message object; must be a hash reference that also provides
#           add_uri_detail_list().  Results are stored under
#           $pms->{pdfinfo}.
#   $part - MIME part object; must provide decode() and may carry the
#           attachment file name in $part->{name}.
#
# Recorded under $pms->{pdfinfo}:
#   names_pdf, encrypted, has_form, has_script, has_auto_script,
#   dems_pdf ("HxW" keys), count_pdf_images, count_pdf_uris, pc_pdf,
#   details->{author,creator,created,modified,producer,title},
#   md5, fuzzy_md5
#
# Tags set: PDFVERSION, PDFNAME, PDFIMGDIM, PDFIMGAREA, PDFAUTHOR,
#   PDFCREATOR, PDFPRODUCER, PDFTITLE, PDFMD5, PDFMD5FUZZY1, PDFMD5FUZZY2
#
# Returns early (bare return) when no "%PDF-x.y" magic is found near the
# start of the data; otherwise the function is used for its side effects.
sub _get_pdf_details {
  my ($pms, $part) = @_;

  my $data = $part->decode();

  # Remove UTF-8 BOM
  $data =~ s/^\xef\xbb\xbf//;

  # Search magic in first 1024 bytes
  if ($data !~ /^.{0,1024}\%PDF\-(\d\.\d)/s) {
    dbg("pdfinfo: PDF magic header not found, invalid file?");
    return;
  }
  my $version = $1;
  _set_tag($pms, 'PDFVERSION', $version);
  # dbg("pdfinfo: pdf version = $version");

  my ($fuzzy_data, $pdf_tags);
  my ($md5, $fuzzy_md5) = ('','');
  my ($total_height, $total_width, $total_area, $line_count) = (0,0,0,0);

  my $name = $part->{name} || '';
  _set_tag($pms, 'PDFNAME', $name);
  # store the file name so we can check pdf_named() or pdf_name_match() later.
  $pms->{pdfinfo}->{names_pdf}->{$name} = 1 if $name;

  my $no_more_fuzzy = 0;
  my $got_image = 0;
  my $encrypted = 0;
  my $has_form = 0;
  my $has_script = 0;
  my $has_auto_script = 0;
  my %uris;

  # Image dimensions must survive from one line to the next: /Width and
  # /Height are frequently written on separate lines, so they are declared
  # outside the loop and only reset once a complete pair has been consumed.
  my ($width, $height);

  while ($data =~ /([^\n]+)/g) {
    # dbg("pdfinfo: line=$1");
    my $line = $1;

    # Fuzzy checksum input: at most the first 69 lines, skipping comments
    # and lines that only carry dimensions (those vary between otherwise
    # identical documents).
    if (!$no_more_fuzzy && ++$line_count < 70) {
      if ($line !~ m/^\%/ && $line !~ m/^\/(?:Height|Width|(?:(?:Media|Crop)Box))/ && $line !~ m/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+cm$/) {
        $line =~ s/\s+$//;  # strip off whitespace at end.
        $fuzzy_data .= $line;
      }
      # once we hit the first stream, we stop collecting data for fuzzy md5
      $no_more_fuzzy = 1  if index($line, 'stream') >= 0;
    }

    $got_image = 1  if index($line, '/Image') >= 0;
    if (!$encrypted && index($line, '/Encrypt') == 0) {
      # store encrypted flag.
      $encrypted = $pms->{pdfinfo}->{encrypted} = 1;
    }

    # Detect if the PDF file has an embedded form
    if (!$has_form && index($line, '/AcroForm') == 0) {
      # PDF has a Form.
      $has_form = $pms->{pdfinfo}->{has_form} = 1;
    }

    # Detect if the PDF file has Javascript code that can optionally be started automatically
    if (!$has_script && index($line, '/JS') == 0) {
      # PDF has Javascript code.
      $has_script = $pms->{pdfinfo}->{has_script} = 1;
    }
    if (!$has_auto_script && index($line, '/AA') == 0) {
      $has_auto_script++;
    } elsif (!$has_auto_script && index($line, '/OpenAction') == 0) {
      $has_auto_script++;
    }
    # Only flag auto-start when both a trigger and a script were seen.
    if ($has_auto_script and $has_script) {
      $pms->{pdfinfo}->{has_auto_script} = 1;
    }

    # From a v1.3 pdf
    # [12234] dbg: pdfinfo: line=630 0 0 149 0 0 cm
    # [12234] dbg: pdfinfo: line=/Width 630
    # [12234] dbg: pdfinfo: line=/Height 149
    if ($got_image) {
      if ($line =~ /^(\d+)\s+\d+\s+\d+\s+(\d+)\s+\d+\s+\d+\s+cm$/) {
        $width = $1;
        $height = $2;
      }
      elsif ($line =~ /^\/Width\s(\d+)/) {
        $width = $1;
      }
      elsif ($line =~ /^\/Height\s(\d+)/) {
        $height = $1;
      }
      elsif ($line =~ m/\/Width\s(\d+)\/Height\s(\d+)/) {
        $width = $1;
        $height = $2;
      }
      if ($width && $height) {
        $no_more_fuzzy = 1;
        my $area = $width * $height;
        $total_height += $height;
        $total_width += $width;
        $total_area += $area;
        $pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} = 1;
        $pms->{pdfinfo}->{count_pdf_images}++;
        dbg("pdfinfo: Found image in PDF $name: $height x $width pixels ($area pixels sq.)");
        _set_tag($pms, 'PDFIMGDIM', "${height}x${width}");
        $got_image = $height = $width = 0;  # reset and check for next image
      }
    }

    #
    # Triage - expecting / to be found for rest of the checks
    #
    next unless index($line, '/') >= 0;

    # Second fuzzy checksum input: names of the keys that start a line.
    if ($line =~ m/^\/([A-Za-z]+)/) {
      $pdf_tags .= $1;
    }

    # XXX some pdf have uris but are stored inside binary data
    # At most 20 distinct URIs are collected per PDF.
    if (keys %uris < 20 && $line =~ /(?:\/S\s{0,2}\/URI\s{0,2}|^\s*)\/URI\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )|\((https?:\/\/.{8,256})\)>>/x) {
      my $location;
      if (defined $1 and (index($1, '.') > 0)) {
        $location = _parse_string($1);
      }
      if (not defined($location) or index($location, '.') <= 0) {
        if (defined $2) {
          $location = _parse_string($2);
        } elsif (defined $1) {
          # Decode octal escapes (\NNN) first, then parse again.
          $location = $1;
          local $1;
          $location =~ s/\\([0-3]?[0-7]{1,2})/chr(oct($1))/ge;
          $location = _parse_string($location);
        } else {
          next;
        }
      }
      next unless index($location, '.') > 0; # ignore some binary mess
      next if $location =~ /\0/; # ignore urls with NUL characters
      if (!exists $uris{$location}) {
        $uris{$location} = 1;
        dbg("pdfinfo: found URI: $location");
        $pms->{pdfinfo}->{count_pdf_uris}++;
        $pms->add_uri_detail_list($location);
      }
    }

    # [5310] dbg: pdfinfo: line=<</Producer(GPL Ghostscript 8.15)
    # [5310] dbg: pdfinfo: line=/CreationDate(D:20070703144220)
    # [5310] dbg: pdfinfo: line=/ModDate(D:20070703144220)
    # [5310] dbg: pdfinfo: line=/Title(Microsoft Word - Document1)
    # [5310] dbg: pdfinfo: line=/Creator(PScript5.dll Version 5.2)
    # [5310] dbg: pdfinfo: line=/Author(colet)>>endobj
    # or all on same line inside xml - v1.6+
    # <</CreationDate(D:20070226165054-06'00')/Creator( Adobe Photoshop CS2 Windows)/Producer(Adobe Photoshop for Windows -- Image Conversion Plug-in)/ModDate(D:20070226165100-06'00')>>
    # Or hex values
    # /Creator<FEFF005700720069007400650072>
    if ($line =~ /\/Author\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
      my $author = _parse_string($1);
      dbg("pdfinfo: found property Author=$author");
      $pms->{pdfinfo}->{details}->{author}->{$author} = 1;
      _set_tag($pms, 'PDFAUTHOR', $author);
    }
    if ($line =~ /\/Creator\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
      my $creator = _parse_string($1);
      dbg("pdfinfo: found property Creator=$creator");
      $pms->{pdfinfo}->{details}->{creator}->{$creator} = 1;
      _set_tag($pms, 'PDFCREATOR', $creator);
    }
    if ($line =~ /\/CreationDate\s{0,2}\(D\:(\d+)/) {
      my $created = _parse_string($1);
      dbg("pdfinfo: found property Created=$created");
      $pms->{pdfinfo}->{details}->{created}->{$created} = 1;
    }
    if ($line =~ /\/ModDate\s{0,2}\(D\:(\d+)/) {
      my $modified = _parse_string($1);
      dbg("pdfinfo: found property Modified=$modified");
      $pms->{pdfinfo}->{details}->{modified}->{$modified} = 1;
    }
    if ($line =~ /\/Producer\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
      my $producer = _parse_string($1);
      dbg("pdfinfo: found property Producer=$producer");
      $pms->{pdfinfo}->{details}->{producer}->{$producer} = 1;
      _set_tag($pms, 'PDFPRODUCER', $producer);
    }
    if ($line =~ /\/Title\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
      my $title = _parse_string($1);
      dbg("pdfinfo: found property Title=$title");
      $pms->{pdfinfo}->{details}->{title}->{$title} = 1;
      _set_tag($pms, 'PDFTITLE', $title);
    }
  }

  # if we had multiple images in the pdf, we need to store the total HxW as well.
  # If it was a single Image PDF, then this value will already be in the hash.
  $pms->{pdfinfo}->{dems_pdf}->{"${total_height}x${total_width}"} = 1 if ($total_height && $total_width);

  if ($total_area) {
    $pms->{pdfinfo}->{pc_pdf} = $total_area;
    _set_tag($pms, 'PDFIMGAREA', $total_area);
    dbg("pdfinfo: Total HxW: $total_height x $total_width ($total_area area)");
  }

  # Three checksums: the whole decoded part, the fuzzy line data, and the
  # concatenated leading key names.
  $md5 = uc(md5_hex($data)) if $data;
  $fuzzy_md5 = uc(md5_hex($fuzzy_data)) if $fuzzy_data;
  my $tags_md5 = '';
  $tags_md5 = uc(md5_hex($pdf_tags)) if $pdf_tags;

  dbg("pdfinfo: MD5 results for $name: md5=$md5 fuzzy1=$fuzzy_md5 fuzzy2=$tags_md5");

  if ($md5) {
    $pms->{pdfinfo}->{md5}->{$md5} = 1;
    # The PDFMD5 tag carries the full-content checksum, matching the hash
    # entry stored just above (the fuzzy value has its own tag below).
    _set_tag($pms, 'PDFMD5', $md5);
  }
  if ($fuzzy_md5) {
    $pms->{pdfinfo}->{fuzzy_md5}->{$fuzzy_md5} = 1;
    _set_tag($pms, 'PDFMD5FUZZY1', $fuzzy_md5);
  }
  if ($tags_md5) {
    $pms->{pdfinfo}->{fuzzy_md5}->{$tags_md5} = 1;
    _set_tag($pms, 'PDFMD5FUZZY2', $tags_md5);
  }
}