in sdk/serialization/azure-xml/src/main/java/com/azure/xml/implementation/aalto/util/CharsetNames.java [53:227]
/**
 * Maps a declared character-set name onto one of the canonical {@code CS_*}
 * names used by this class, when the name is recognised.
 * <p>
 * Canonical names follow the IANA recommendation
 * (http://www.iana.org/assignments/character-sets). Matching is meant to be
 * loose (case-insensitive, ignoring spacing, underscore vs. hyphen etc.) so
 * that detection is as extensive as possible; the loose comparison itself is
 * delegated to {@code equalEncodings} and {@code encodingStartsWith}.
 * <p>
 * A leading {@code cs} (in any letter case) is treated as the IANA alias
 * prefix (csASCII, csUcs4, ...) and is stripped before matching.
 *
 * @param csName the encoding name to normalize; may be {@code null}
 * @return the canonical constant for a recognised name. {@code null} and
 *     names shorter than three characters are returned untouched. An
 *     unrecognised name is returned as is, except that a stripped
 *     {@code cs} prefix is not restored.
 */
public static String normalize(String csName) {
    if (csName == null || csName.length() < 3) {
        return csName;
    }

    // The 'cs' prefix applies to pretty much all actual encodings in the
    // IANA list, so strip it up front. The length check above guarantees
    // that at least one character remains afterwards.
    boolean gotCsPrefix = false;
    char c = csName.charAt(0);
    if (c == 'c' || c == 'C') {
        char d = csName.charAt(1);
        if (d == 's' || d == 'S') {
            csName = csName.substring(2);
            c = csName.charAt(0);
            gotCsPrefix = true;
        }
    }

    // Dispatch on the first character to keep the number of loose
    // comparisons per call small.
    switch (c) {
        case 'a':
        case 'A':
            // Exact match first as a cheap fast path, then the loose one.
            if (csName.equals("ASCII") || equalEncodings(csName, "ASCII")) {
                return CS_US_ASCII;
            }
            break;

        case 'c':
        case 'C':
            // There are boatloads of names starting with 'c', but none of
            // them is mapped yet (to be implemented); fall through to the
            // default return below.
            break;

        case 'e':
        case 'E':
            // Only the all-upper and all-lower spellings are recognised.
            if (csName.startsWith("EBCDIC") || csName.startsWith("ebcdic")) {
                return CS_EBCDIC;
            }
            break;

        case 'i':
        case 'I':
            if (csName.equals(CS_ISO_LATIN1)
                || equalEncodings(csName, CS_ISO_LATIN1)
                || equalEncodings(csName, "ISO-Latin1")) {
                return CS_ISO_LATIN1;
            }
            if (encodingStartsWith(csName, "ISO-10646")) {
                // There are many ISO-10646 variants (see the IANA list);
                // what follows the "10646" digits selects the variant.
                int ix = csName.indexOf("10646");
                String suffix = csName.substring(ix + 5);
                if (equalEncodings(suffix, "UCS-Basic")) {
                    return CS_US_ASCII;
                }
                if (equalEncodings(suffix, "Unicode-Latin1")) {
                    return CS_ISO_LATIN1;
                }
                if (equalEncodings(suffix, "UCS-2")) {
                    return CS_UTF16; // endianness?
                }
                if (equalEncodings(suffix, "UCS-4")) {
                    return CS_UTF32; // endianness?
                }
                if (equalEncodings(suffix, "UTF-1")) {
                    // "Universal Transfer Format (1), this is the multibyte
                    // encoding, that subsets ASCII-7"; treated as ASCII.
                    return CS_US_ASCII;
                }
                if (equalEncodings(suffix, "J-1")) {
                    // ISO-10646-J-1 (ISO 10646 Japanese, see RFC 1815);
                    // exact meaning unclear, so it is considered ASCII.
                    return CS_US_ASCII;
                }
                if (equalEncodings(suffix, "US-ASCII")) {
                    return CS_US_ASCII;
                }
            }
            break;

        case 'j':
        case 'J':
            if (equalEncodings(csName, "JIS_Encoding")) {
                return CS_SHIFT_JIS;
            }
            break;

        case 's':
        case 'S':
            if (equalEncodings(csName, "Shift_JIS")) {
                return CS_SHIFT_JIS;
            }
            break;

        case 'u':
        case 'U':
            // After prefix stripping the name may be a single character
            // (for example "csU"), so guard the charAt(1) below.
            if (csName.length() < 2) {
                break;
            }
            switch (csName.charAt(1)) {
                case 'c':
                case 'C':
                    if (equalEncodings(csName, "UCS-2")) {
                        return CS_UTF16;
                    }
                    if (equalEncodings(csName, "UCS-4")) {
                        return CS_UTF32;
                    }
                    break;

                case 'n': // csUnicodeXxx
                case 'N':
                    // These are only valid as 'cs'-prefixed IANA aliases.
                    if (gotCsPrefix) {
                        if (equalEncodings(csName, "Unicode")) {
                            return CS_UTF16; // need BOM
                        }
                        // csUnicodeLatin1 is the IANA alias of
                        // ISO-10646-Unicode-Latin1; same mapping as the
                        // ISO-10646 branch above.
                        if (equalEncodings(csName, "UnicodeLatin1")) {
                            return CS_ISO_LATIN1;
                        }
                        // csUnicodeASCII is the IANA alias of
                        // ISO-10646-UCS-Basic; same mapping as the
                        // ISO-10646 branch above.
                        if (equalEncodings(csName, "UnicodeAscii")) {
                            return CS_US_ASCII;
                        }
                    }
                    break;

                case 's':
                case 'S':
                    if (equalEncodings(csName, "US-ASCII")) {
                        return CS_US_ASCII;
                    }
                    break;

                case 't':
                case 'T':
                    if (csName.equals(CS_UTF8) || equalEncodings(csName, CS_UTF8)) {
                        return CS_UTF8;
                    }
                    // Endian-specific names are tested before the generic
                    // ones, as in the original ordering.
                    if (equalEncodings(csName, "UTF-16BE")) {
                        return CS_UTF16BE;
                    }
                    if (equalEncodings(csName, "UTF-16LE")) {
                        return CS_UTF16LE;
                    }
                    if (equalEncodings(csName, "UTF-16")) {
                        return CS_UTF16;
                    }
                    if (equalEncodings(csName, "UTF-32BE")) {
                        return CS_UTF32BE;
                    }
                    if (equalEncodings(csName, "UTF-32LE")) {
                        return CS_UTF32LE;
                    }
                    if (equalEncodings(csName, "UTF-32")) {
                        return CS_UTF32;
                    }
                    if (equalEncodings(csName, "UTF")) {
                        // 21-Jan-2006, TSa: bare "UTF" is ambiguous; it is
                        // mapped to UTF-16 here.
                        return CS_UTF16;
                    }
            }
            break;
    }

    // Not recognised: hand back the name (minus any stripped 'cs' prefix).
    return csName;
}