public static String normalize()

in sdk/serialization/azure-xml/src/main/java/com/azure/xml/implementation/aalto/util/CharsetNames.java [53:227]


    /**
     * Maps a charset name to the canonical name constant used by this class,
     * when the name is recognized.
     * <p>
     * A leading {@code cs} prefix (in any letter case) is stripped before
     * matching, and most comparisons are delegated to {@code equalEncodings}
     * / {@code encodingStartsWith}, which are meant to compare loosely
     * (case-insensitively, ignoring spacing and underscore-vs-hyphen
     * differences).
     *
     * @param csName charset name to normalize; may be {@code null}
     * @return the canonical constant for a recognized name; otherwise the
     *     input name, minus its {@code cs} prefix if one was stripped.
     *     {@code null} and names shorter than 3 characters are returned
     *     unchanged.
     */
    public static String normalize(String csName) {
        if (csName == null || csName.length() < 3) {
            return csName;
        }

        /* Canonical charset names here are from the IANA recommendation:
         *   http://www.iana.org/assignments/character-sets
         * but comparison is done loosely (case-insensitive, ignoring
         * spacing, underscore vs. hyphen etc) to make detection as
         * extensive as possible.
         */

        /* First a bit of pre-filtering: the 'cs' prefix is applicable to
         * pretty much all actual encodings (as per IANA recommendations;
         * csASCII, csUcs4 etc), so strip it out if present.
         */
        boolean gotCsPrefix = false;
        char c = csName.charAt(0);
        if (c == 'c' || c == 'C') {
            char d = csName.charAt(1);
            if (d == 's' || d == 'S') {
                // Length was at least 3, so at least one character remains
                csName = csName.substring(2);
                c = csName.charAt(0);
                gotCsPrefix = true;
            }
        }

        switch (c) {
            case 'a':
            case 'A':
                // Exact match first as a cheap fast path, then loose comparison
                if (csName.equals("ASCII") || equalEncodings(csName, "ASCII")) {
                    return CS_US_ASCII;
                }
                break;

            case 'c':
            case 'C':
                // There are boatloads of names starting with 'c', but no
                // normalization is implemented for them (yet): such names
                // are returned as-is.
                break;

            case 'e':
            case 'E':
                // Note: only all-upper and all-lower case prefixes are recognized
                if (csName.startsWith("EBCDIC") || csName.startsWith("ebcdic")) {
                    return CS_EBCDIC;
                }
                break;

            case 'i':
            case 'I':
                if (csName.equals(CS_ISO_LATIN1)
                    || equalEncodings(csName, CS_ISO_LATIN1)
                    || equalEncodings(csName, "ISO-Latin1")) {
                    return CS_ISO_LATIN1;
                }
                if (encodingStartsWith(csName, "ISO-10646")) {
                    /* There are boatloads of alternatives here (see
                     * http://www.iana.org/assignments/character-sets
                     * for details); dispatch on whatever follows "10646".
                     */
                    int ix = csName.indexOf("10646");
                    String suffix = csName.substring(ix + 5);
                    if (equalEncodings(suffix, "UCS-Basic")) {
                        return CS_US_ASCII;
                    }
                    if (equalEncodings(suffix, "Unicode-Latin1")) {
                        return CS_ISO_LATIN1;
                    }
                    if (equalEncodings(suffix, "UCS-2")) {
                        return CS_UTF16; // endianness?
                    }
                    if (equalEncodings(suffix, "UCS-4")) {
                        return CS_UTF32; // endianness?
                    }
                    if (equalEncodings(suffix, "UTF-1")) {
                        // "Universal Transfer Format (1), this is the multibyte encoding, that subsets ASCII-7"???
                        return CS_US_ASCII;
                    }
                    if (equalEncodings(suffix, "J-1")) {
                        // Name: ISO-10646-J-1, Source: ISO 10646 Japanese, see RFC 1815.
                        // ... so what does that really mean? let's consider it ascii
                        return CS_US_ASCII;
                    }
                    if (equalEncodings(suffix, "US-ASCII")) {
                        return CS_US_ASCII;
                    }
                }
                break;

            case 'j':
            case 'J':
                if (equalEncodings(csName, "JIS_Encoding")) {
                    return CS_SHIFT_JIS;
                }
                break;

            case 's':
            case 'S':
                if (equalEncodings(csName, "Shift_JIS")) {
                    return CS_SHIFT_JIS;
                }
                break;

            case 'u':
            case 'U':
                // A 3-character input such as "csU" is down to a single
                // character once the prefix has been stripped
                if (csName.length() < 2) {
                    break;
                }
                switch (csName.charAt(1)) {
                    case 'c':
                    case 'C':
                        if (equalEncodings(csName, "UCS-2")) {
                            return CS_UTF16;
                        }
                        if (equalEncodings(csName, "UCS-4")) {
                            return CS_UTF32;
                        }
                        break;

                    case 'n': // csUnicodeXxx
                    case 'N':
                        // These names are only recognized in their
                        // 'cs'-prefixed form
                        if (gotCsPrefix) {
                            if (equalEncodings(csName, "Unicode")) {
                                return CS_UTF16; // need BOM
                            }
                            // csUnicodeLatin1 is the IANA alias of
                            // ISO-10646-Unicode-Latin1, and csUnicodeASCII
                            // that of ISO-10646-UCS-Basic: map them the same
                            // way as the ISO-10646 names above
                            if (equalEncodings(csName, "UnicodeLatin1")) {
                                return CS_ISO_LATIN1;
                            }
                            if (equalEncodings(csName, "UnicodeAscii")) {
                                return CS_US_ASCII;
                            }
                        }
                        break;

                    case 's':
                    case 'S':
                        if (equalEncodings(csName, "US-ASCII")) {
                            return CS_US_ASCII;
                        }
                        break;

                    case 't':
                    case 'T':
                        if (csName.equals(CS_UTF8) || equalEncodings(csName, CS_UTF8)) {
                            return CS_UTF8;
                        }
                        if (equalEncodings(csName, "UTF-16BE")) {
                            return CS_UTF16BE;
                        }
                        if (equalEncodings(csName, "UTF-16LE")) {
                            return CS_UTF16LE;
                        }
                        if (equalEncodings(csName, "UTF-16")) {
                            return CS_UTF16;
                        }
                        if (equalEncodings(csName, "UTF-32BE")) {
                            return CS_UTF32BE;
                        }
                        if (equalEncodings(csName, "UTF-32LE")) {
                            return CS_UTF32LE;
                        }
                        if (equalEncodings(csName, "UTF-32")) {
                            return CS_UTF32;
                        }
                        if (equalEncodings(csName, "UTF")) {
                            // 21-Jan-2006, TSa: ??? What is this to do... ?
                            return CS_UTF16;
                        }
                        break;

                    default:
                        break;
                }
                break;

            default:
                break;
        }

        // Not recognized: hand back the name (without 'cs' prefix, if any)
        return csName;
    }