in lib/Mail/SpamAssassin/Message/Node.pm [453:687]
# Decode a string of octets into text, guessing the character set when the
# declared one is missing, wrong, or unusable.
#
# Arguments are read through @_ aliases so that large bodies are not copied:
#   $_[0]  octets to decode (a warning is issued if it carries the utf8 flag)
#   $_[1]  declared character set; undef or '' is treated as 'us-ascii'
#   $_[2]  true: return Unicode characters, false: return UTF-8 octets
#   $_[3]  true: insist on the declared character set, i.e. skip the
#          speculative UTF-8 attempt and decode as declared without FB_CROAK
#
# Decoding attempts, in order: strict UTF-8 (unless insisting on the declared
# charset), the declared charset (UTF-16 gets its own detection), Windows-1252
# when only a limited set of 8-bit characters is present,
# Encode::Detect::Detector when available, and finally a non-strict
# Windows-1252 decode.
#
# Returns the decoded text in the requested form, carrying over the taintedness
# of the input. Input which is plain 7-bit US-ASCII, or which could not be
# decoded by any means, is returned unchanged.
#
sub _normalize {
# my $data = $_[0];  # avoid copying large strings
  my $charset_declared = $_[1];
  my $return_decoded = $_[2];  # true: Unicode characters, false: UTF-8 octets
  my $insist_on_declared_charset = $_[3];  # no FB_CROAK in Encode::decode

  warn "message: _normalize() was given characters, expected bytes: $_[0]\n"
    if utf8::is_utf8($_[0]);

  # workaround for Encode::decode taint laundering bug [rt.cpan.org #84879]
  my $data_taint = substr($_[0], 0, 0);  # empty string, tainted like $data

  # A missing or empty declaration must not reach the regexp matches, lc()
  # and debug formatting below as undef; treat it as plain US-ASCII.
  if (!defined $charset_declared || $charset_declared eq '') {
    $charset_declared = 'us-ascii';
  }

  # number of characters with code above 127
  my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c;

  if (!$cnt_8bits &&
      $charset_declared =~
        /^(?: (?:US-)?ASCII | ANSI[_ ]? X3\.4- (?:1986|1968) |
              ISO646-US )\z/xsi)
  { # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is
    dbg("message: contains only US-ASCII characters, declared %s, not decoding",
        $charset_declared);
    return $_[0];  # is all-ASCII, no need for decoding
  }

  if (!$cnt_8bits &&
      $charset_declared =~
        /^(?: ISO[ -]?8859 (?: - \d{1,2} )? | Windows-\d{4} |
              UTF-?8 | (KOI8|EUC)-[A-Z]{1,2} |
              Big5 | GBK | GB[ -]?18030 (?:-20\d\d)? )\z/xsi)
  { # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII
    dbg("message: contains only US-ASCII characters, declared %s, not decoding",
        $charset_declared);
    return $_[0];  # is all-ASCII, no need for decoding
  }

  # Result of a successful decoding (Unicode characters); stays undef
  # for as long as no decoding attempt has succeeded.
  my $rv;

  # Try first as strict UTF-8, ignoring the declaration?
  my $tried_utf8;
  if ($cnt_8bits && !$insist_on_declared_charset) {
    if (eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) {
      dbg("message: decoded as charset UTF-8, declared %s",
          $charset_declared);
      # the input was just proven to be valid UTF-8, so the original octets
      # already are the wanted UTF-8 octets and need no re-encoding
      return $_[0]  if !$return_decoded;
      $rv .= $data_taint;  # carry taintedness over, avoid Encode bug
      return $rv;  # decoded
    } else {
      my $err = _normalize_decode_err($@);
      dbg("message: failed decoding as charset UTF-8, declared %s%s",
          $charset_declared, $err);
      $tried_utf8 = 1;
    }
  }

  if ($charset_declared =~ /^(?:US-)?ASCII\z/i
      && !$insist_on_declared_charset) {
    # declared as US-ASCII but contains 8-bit characters, makes no sense
    # to attempt decoding first as strict US-ASCII as we know it would fail

  } elsif ($charset_declared =~ /^UTF[ -]?16/i) {
    # Handle cases where spammers use UTF-16 encoding without including a BOM
    # or declaring endianness as reported at:
    # https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252

    my $decoder = detect_utf16( $_[0] );
    if (defined $decoder) {
      if (eval { $rv = $decoder->decode($_[0], 1|8); defined $rv }) {
        dbg("message: decoded as charset %s, declared %s",
            $decoder->name, $charset_declared);
        # No early return here: unlike in the UTF-8 case above the original
        # octets are UTF-16, so when UTF-8 octets are requested the decoded
        # text must be re-encoded, which the common code at the end does.
      } else {
        my $err = _normalize_decode_err($@);
        dbg("message: failed decoding as charset %s, declared %s%s",
            $decoder->name, $charset_declared, $err);
      }
    }

  } else {
    # try decoding as a declared character set

    # -> http://en.wikipedia.org/wiki/Windows-1252
    # Windows-1252 character encoding is a superset of ISO 8859-1, but differs
    # from the IANA's ISO-8859-1 by using displayable characters rather than
    # control characters in the 80 to 9F (hex) range. [...]
    # It is very common to mislabel Windows-1252 text with the charset label
    # ISO-8859-1. A common result was that all the quotes and apostrophes
    # (produced by "smart quotes" in word-processing software) were replaced
    # with question marks or boxes on non-Windows operating systems, making
    # text difficult to read. Most modern web browsers and e-mail clients
    # treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate
    # such mislabeling. This is now standard behavior in the draft HTML 5
    # specification, which requires that documents advertised as ISO-8859-1
    # actually be parsed with the Windows-1252 encoding.
    #
    my($chset, $decoder);
    if ($charset_declared =~ /^(?: ISO-?8859-1 | Windows-1252 | CP1252 )\z/xi) {
      $chset = 'Windows-1252';  $decoder = $enc_w1252;
    } elsif ($charset_declared =~ /^UTF-?8\z/i) {
      $chset = 'UTF-8';  $decoder = $enc_utf8;
    } else {
      $chset = $charset_declared;
      $decoder = Encode::find_encoding($chset);
      if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
        $decoder = Encode::find_encoding('GBK');  # a subset of GB18030
        dbg("message: no decoder for a declared charset %s, using GBK",
            $chset)  if $decoder;
      }
    }
    if (!$decoder) {
      dbg("message: failed decoding, no decoder for a declared charset %s",
          $chset);
    }
    elsif ($tried_utf8 && $chset eq 'UTF-8') {
      # was already tried initially, no point doing again
    }
    else {
      my $check_flags = Encode::LEAVE_SRC;  # 0x0008
      # strict decoding, unless told to accept the declared charset as is
      $check_flags |= Encode::FB_CROAK  unless $insist_on_declared_charset;
      if (eval { $rv = $decoder->decode($_[0], $check_flags); defined $rv }) {
        dbg("message: decoded as charset %s, declared %s",
            $decoder->name, $charset_declared);
      } else {
        my $err = _normalize_decode_err($@);
        dbg("message: failed decoding as charset %s, declared %s%s",
            $decoder->name, $charset_declared, $err);
      }
    }
  }

  # If the above failed, check if it is US-ASCII, possibly extended by few
  # NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing
  # some popular punctuation or special characters from Windows-1252 in
  # the \x80-\x9F range (which is unassigned in ISO-8859-*).
  # Note that Windows-1252 is a proper superset of ISO-8859-1.
  #
  if (!defined $rv && !$cnt_8bits) {
    dbg("message: contains only US-ASCII characters, declared %s, not decoding",
        $charset_declared);
    return $_[0];  # is all-ASCII, no need for decoding

  } elsif (!defined $rv && $enc_w1252 &&
      # ASCII  NBSP (c) SHY    '   "     ...   '".-   TM
     #$_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c)
      # Bug 7656: Include latin1 diacritic letters to Windows-1252 autodetection,
      # Encode::Detect::Detector might identify them as Windows-1255 (Hebrew!)
      $_[0] !~ tr/\x00-\x7f\xa0\xa9\xad\x82\x84\x85\x91-\x97\x99\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe//c)
  { # ASCII + NBSP + SHY + some punctuation characters
    # NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too
    # consider also: AE (r), 80 Euro
    eval { $rv = $enc_w1252->decode($_[0], 1|8) };  # FB_CROAK | LEAVE_SRC
    my $err = _normalize_decode_err($@);
    # the above can't fail, but keep code general just in case
    dbg("message: %s as guessed charset %s, declared %s%s",
        defined $rv ? 'decoded' : 'failed decoding',
        'Windows-1252', $charset_declared, $err);
  }

  # If we were unsuccessful so far, try some guesswork
  # based on Encode::Detect::Detector .

  if (defined $rv) {
    # done, no need for guesswork
  } elsif (!$have_encode_detector) {
    dbg("message: Encode::Detect::Detector not available, declared %s failed",
        $charset_declared);
  } else {
    my $charset_detected = Encode::Detect::Detector::detect($_[0]);
    # a detected charset equal to the declared one was already tried above
    if ($charset_detected && lc $charset_detected ne lc $charset_declared) {
      my $decoder = Encode::find_encoding($charset_detected);
      if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
        $decoder = Encode::find_encoding('GBK');  # a subset of GB18030
        dbg("message: no decoder for a detected charset %s, using GBK",
            $charset_detected)  if $decoder;
      }
      if (!$decoder) {
        dbg("message: failed decoding, no decoder for a detected charset %s",
            $charset_detected);
      } else {
        eval { $rv = $decoder->decode($_[0], 1|8) };  # FB_CROAK | LEAVE_SRC
        my $err = _normalize_decode_err($@);
        dbg("message: %s as detected charset %s, declared %s%s",
            defined $rv ? 'decoded' : 'failed decoding',
            $charset_detected, $charset_declared, $err);
      }
    }
  }

  if (!defined $rv) {  # all decoding attempts failed so far, probably garbage
    # go for Windows-1252 which can't fail
    eval { $rv = $enc_w1252->decode($_[0]) };
    my $err = _normalize_decode_err($@);
    dbg("message: %s as last-resort charset %s, declared %s%s",
        defined $rv ? 'decoded' : 'failed decoding',
        'Windows-1252', $charset_declared, $err);
  }

  if (!defined $rv) {  # just in case - all decoding attempts failed so far
    return $_[0];  # garbage-in / garbage-out, return unchanged octets
  }

  # decoding octets to characters was successful
  if (!$return_decoded) {
    # utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg
    utf8::encode($rv);  # encode Unicode characters to UTF-8 octets
  }
  $rv .= $data_taint;  # carry taintedness over, avoid Encode bug
  return $rv;
}

# Private helper of _normalize(): turn the error text of a failed eval
# (typically $@) into a short " (reason)" suffix for a debug message:
# whitespace runs are collapsed and the trailing " at FILE line N" location
# is removed. Returns an empty string when there was no error.
#
sub _normalize_decode_err {
  my $err = $_[0];  # copy at once, the caller typically passes $@
  return ''  if !$err;
  $err =~ s/\s+/ /gs;
  $err =~ s/(.*) at .*/$1/;
  return " ($err)";
}