sub _normalize()

in lib/Mail/SpamAssassin/Message/Node.pm [453:687]


# Decode a chunk of message text (raw octets), making a best effort to cope
# with missing, wrong or unsupported character set declarations.
#
# Arguments (positional; the data is only ever accessed through $_[0] so
# that large strings are not copied):
#   $_[0]  octets to be decoded; a warning is issued if the string is
#          already flagged as characters
#   $_[1]  declared character set; undefined or empty is treated as
#          us-ascii, which is the default according to RFC 2045
#   $_[2]  true: return Unicode characters, false: return UTF-8 octets
#   $_[3]  true: insist on the declared character set, i.e. skip the
#          initial UTF-8 probe and decode as declared without FB_CROAK
#
# Order of attempts:
#   1. data with no 8-bit octets, declared as US-ASCII or as one of the
#      listed ASCII-compatible character sets, is returned unchanged
#   2. strict UTF-8, regardless of the declaration (unless insisting)
#   3. the declared character set (UTF-16 by way of detect_utf16)
#   4. Windows-1252, if the data only holds ASCII plus a conservative set
#      of Windows-1252 / Latin-1 octets
#   5. whatever Encode::Detect::Detector suggests, when it is available
#   6. Windows-1252 as a last resort
#
# Returns the decoded text in the requested form, or the unchanged input
# if it needed no decoding or if every attempt failed.  Taintedness of
# the input is carried over to the result.
#
sub _normalize {
# my $data = $_[0];  # avoid copying large strings
  my $charset_declared = $_[1];
  my $return_decoded = $_[2];  # true: Unicode characters, false: UTF-8 octets
  my $insist_on_declared_charset = $_[3];  # no FB_CROAK in Encode::decode

  warn "message: _normalize() was given characters, expected bytes: $_[0]\n"
    if utf8::is_utf8($_[0]);

  # workaround for Encode::decode taint laundering bug [rt.cpan.org #84879]
  my $data_taint = substr($_[0], 0, 0);  # empty string, tainted like $data

  # A missing charset parameter means US-ASCII (RFC 2045 section 5.2).
  # Settling this up front also keeps the pattern matches, lc() and debug
  # messages below from operating on an undefined value.
  if (!defined $charset_declared || $charset_declared eq '') {
    $charset_declared = 'us-ascii';
  }

  # strict decoding: die on malformed input, leave the source string intact
  my $strict_check = Encode::FB_CROAK() | Encode::LEAVE_SRC();  # 1|8

  # number of characters with code above 127
  my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c;

  if (!$cnt_8bits &&
      ( # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is
        $charset_declared =~
          /^(?: (?:US-)?ASCII | ANSI[_ ]? X3\.4- (?:1986|1968) |
                ISO646-US )\z/xsi
        ||
        # declared as extended ASCII, but is actually a plain 7-bit US-ASCII
        $charset_declared =~
          /^(?: ISO[ -]?8859 (?: - \d{1,2} )? | Windows-\d{4} |
                UTF-?8 | (?:KOI8|EUC)-[A-Z]{1,2} |
                Big5 | GBK | GB[ -]?18030 (?:-20\d\d)? )\z/xsi ))
  {
    dbg("message: contains only US-ASCII characters, declared %s, not decoding",
      $charset_declared);
    return $_[0];  # is all-ASCII, no need for decoding
  }

  # Try first to strictly decode based on a declared character set.

  my $rv;

  # Try first as UTF-8, ignoring the declaration?
  my $tried_utf8;
  if ($cnt_8bits && !$insist_on_declared_charset) {
    if (eval { $rv = $enc_utf8->decode($_[0], $strict_check); defined $rv }) {
      dbg("message: decoded as charset UTF-8, declared %s",
        $charset_declared);
      # the input has just been validated as UTF-8, so when octets are
      # wanted it can be handed back as it is
      return $_[0]  if !$return_decoded;
      $rv .= $data_taint;  # carry taintedness over, avoid Encode bug
      return $rv;  # decoded
    }
    my $err = _normalize_err_suffix($@);
    dbg("message: failed decoding as charset UTF-8, declared %s%s",
      $charset_declared, $err);
    $tried_utf8 = 1;
  }

  if ($charset_declared =~ /^(?:US-)?ASCII\z/i
           && !$insist_on_declared_charset) {
    # declared as US-ASCII but contains 8-bit characters, makes no sense
    # to attempt decoding first as strict US-ASCII as we know it would fail

  } elsif ($charset_declared =~ /^UTF[ -]?16/i) {
    # Handle cases where spammers use UTF-16 encoding without including a BOM
    # or declaring endianness as reported at:
    # https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252

    my $decoder = detect_utf16( $_[0] );
    if (defined $decoder) {
      if (eval { $rv = $decoder->decode($_[0], $strict_check); defined $rv }) {
        dbg("message: decoded as charset %s, declared %s",
          $decoder->name, $charset_declared);
        # No early return here: unlike in the UTF-8 probe above, the input
        # is UTF-16 and must not be handed back as if it were UTF-8 octets.
        # The common exit path at the end of this routine re-encodes the
        # result when octets are wanted and restores taintedness.
      } else {
        my $err = _normalize_err_suffix($@);
        dbg("message: failed decoding as charset %s, declared %s%s",
          $decoder->name, $charset_declared, $err);
      }
    }
  } else {
    # try decoding as a declared character set

    # ->  http://en.wikipedia.org/wiki/Windows-1252
    # Windows-1252 character encoding is a superset of ISO 8859-1, but differs
    # from the IANA's ISO-8859-1 by using displayable characters rather than
    # control characters in the 80 to 9F (hex) range. [...]
    # It is very common to mislabel Windows-1252 text with the charset label
    # ISO-8859-1. A common result was that all the quotes and apostrophes
    # (produced by "smart quotes" in word-processing software) were replaced
    # with question marks or boxes on non-Windows operating systems, making
    # text difficult to read. Most modern web browsers and e-mail clients
    # treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate
    # such mislabeling. This is now standard behavior in the draft HTML 5
    # specification, which requires that documents advertised as ISO-8859-1
    # actually be parsed with the Windows-1252 encoding.
    #
    my($chset, $decoder);
    if ($charset_declared =~ /^(?: ISO-?8859-1 | Windows-1252 | CP1252 )\z/xi) {
      $chset = 'Windows-1252'; $decoder = $enc_w1252;
    } elsif ($charset_declared =~ /^UTF-?8\z/i) {
      $chset = 'UTF-8'; $decoder = $enc_utf8;
    } else {
      $chset = $charset_declared;
      $decoder = Encode::find_encoding($chset);
      if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
        $decoder = Encode::find_encoding('GBK');  # a subset of GB18030
        dbg("message: no decoder for a declared charset %s, using GBK",
            $chset)  if $decoder;
      }
    }
    if (!$decoder) {
      dbg("message: failed decoding, no decoder for a declared charset %s",
          $chset);
    }
    elsif ($tried_utf8 && $chset eq 'UTF-8') {
      # was already tried initially, no point doing again
    }
    else {
      # when insisting on the declared charset decoding is lenient (no
      # FB_CROAK), so malformed input does not make the decoder die
      my $check_flags = Encode::LEAVE_SRC();  # 0x0008
      $check_flags |= Encode::FB_CROAK()  unless $insist_on_declared_charset;
      if (eval { $rv = $decoder->decode($_[0], $check_flags); defined $rv }) {
        dbg("message: decoded as charset %s, declared %s",
          $decoder->name, $charset_declared);
      } else {
        my $err = _normalize_err_suffix($@);
        dbg("message: failed decoding as charset %s, declared %s%s",
          $decoder->name, $charset_declared, $err);
      }
    }
  }

  # If the above failed, check if it is US-ASCII, possibly extended by few
  # NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing
  # some popular punctuation or special characters from Windows-1252 in
  # the \x80-\x9F range (which is unassigned in ISO-8859-*).
  # Note that Windows-1252 is a proper superset of ISO-8859-1.
  #
  if (!defined $rv && !$cnt_8bits) {
    dbg("message: contains only US-ASCII characters, declared %s, not decoding",
        $charset_declared);
    return $_[0];  # is all-ASCII, no need for decoding

  } elsif (!defined $rv && $enc_w1252 &&
     #             ASCII  NBSP (c) SHY  '   "  ...   '".-   TM
     #$_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c)
     # Bug 7656: Include latin1 diacritic letters to Windows-1252 autodetection,
     # Encode::Detect::Detector might identify them as Windows-1255 (Hebrew!)
      $_[0] !~ tr/\x00-\x7f\xa0\xa9\xad\x82\x84\x85\x91-\x97\x99\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe//c)
  { # ASCII + NBSP + SHY + some punctuation characters
    # NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too
    # consider also: AE (r), 80 Euro
    eval { $rv = $enc_w1252->decode($_[0], $strict_check) };
    my $err = _normalize_err_suffix($@);
    # the above can't fail, but keep code general just in case
    dbg("message: %s as guessed charset %s, declared %s%s",
        defined $rv ? 'decoded' : 'failed decoding',
        'Windows-1252', $charset_declared, $err);
  }

  # If we were unsuccessful so far, try some guesswork
  # based on Encode::Detect::Detector .

  if (defined $rv) {
    # done, no need for guesswork
  } elsif (!$have_encode_detector) {
    dbg("message: Encode::Detect::Detector not available, declared %s failed",
        $charset_declared);
  } else {
    my $charset_detected = Encode::Detect::Detector::detect($_[0]);
    # a detected charset equal to the declared one was already tried above
    if ($charset_detected && lc $charset_detected ne lc $charset_declared) {
      my $decoder = Encode::find_encoding($charset_detected);
      if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
        $decoder = Encode::find_encoding('GBK');  # a subset of GB18030
        dbg("message: no decoder for a detected charset %s, using GBK",
            $charset_detected)  if $decoder;
      }
      if (!$decoder) {
        dbg("message: failed decoding, no decoder for a detected charset %s",
            $charset_detected);
      } else {
        eval { $rv = $decoder->decode($_[0], $strict_check) };
        my $err = _normalize_err_suffix($@);
        dbg("message: %s as detected charset %s, declared %s%s",
            defined $rv ? 'decoded' : 'failed decoding',
            $charset_detected, $charset_declared, $err);
      }
    }
  }

  if (!defined $rv) {  # all decoding attempts failed so far, probably garbage
    # go for Windows-1252 which can't fail
    eval { $rv = $enc_w1252->decode($_[0]) };
    my $err = _normalize_err_suffix($@);
    dbg("message: %s as last-resort charset %s, declared %s%s",
        defined $rv ? 'decoded' : 'failed decoding',
        'Windows-1252', $charset_declared, $err);
  }

  if (!defined $rv) {  # just in case - all decoding attempts failed so far
    return $_[0];  # garbage-in / garbage-out, return unchanged octets
  }
  # decoding octets to characters was successful
  if (!$return_decoded) {
    # utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg
    utf8::encode($rv);  # encode Unicode characters to UTF-8 octets
  }
  $rv .= $data_taint;  # carry taintedness over, avoid Encode bug
  return $rv;
}

# Private helper of _normalize(): turns an eval error ($@) into a suffix
# for a debug message.  Whitespace runs are collapsed into a single space
# and the trailing " at FILE line N" part is stripped; the result is
# wrapped as " (message)".  Returns an empty string when there is no error.
#
sub _normalize_err_suffix {
  my $err = $_[0];  # copy right away, $@ is a global
  return ''  if !$err;
  $err =~ s/\s+/ /gs;
  $err =~ s/(.*) at .*/$1/;
  return " ($err)";
}