protected final PName addUTFPName()

in sdk/serialization/azure-xml/src/main/java/com/azure/xml/implementation/aalto/in/ByteBasedScanner.java [177:395]


    /**
     * Decodes a name whose UTF-8 bytes are packed into {@code quads}, validates its characters against the
     * XML 1.0 name rules and adds the resulting name to the given symbol table.
     * <p>
     * Bytes are read from each quad most-significant byte first. When the last quad holds fewer than four bytes
     * it is temporarily shifted left so it can be read the same way, and is put back to its original value just
     * before the symbol is added. Note that if decoding is aborted by an exception, the last quad is left in its
     * shifted form.
     *
     * @param symbols Symbol table the decoded name is added to.
     * @param charTypes Character type tables; {@code NAME_CHARS} is used to classify each lead byte.
     * @param hash Hash value handed through to {@code symbols.addSymbol}.
     * @param quads Bytes of the name, packed four per {@code int}.
     * @param qlen Number of entries of {@code quads} in use.
     * @param lastQuadBytes Number of bytes held by the last quad (4 when it is full).
     * @return Whatever {@code symbols.addSymbol} returns for the decoded name.
     * @throws XMLStreamException Presumably raised by the {@code report*} helpers when the name is malformed or
     * truncated (their bodies are outside this method).
     */
    protected final PName addUTFPName(ByteBasedPNameTable symbols, XmlCharTypes charTypes, int hash, int[] quads,
        int qlen, int lastQuadBytes) throws XMLStreamException {
        // 4 bytes per quad, except last one maybe less
        int byteLen = (qlen << 2) - 4 + lastQuadBytes;

        // A partial last quad has its bytes in the low-order positions (leading
        // zero bytes rather than trailing ones). Shift it left so that decoding
        // can read it like a full quad; the original value is kept so it can be
        // put back before storage (the stored key is not aligned).
        int lastQuad;

        if (lastQuadBytes < 4) {
            lastQuad = quads[qlen - 1];
            // 8/16/24 bit left shift
            quads[qlen - 1] = (lastQuad << ((4 - lastQuadBytes) << 3));
        } else {
            lastQuad = 0;
        }

        // Let's handle first char separately (different validation):
        int ch = (quads[0] >>> 24);
        boolean ok;
        int ix = 1;
        char[] cbuf = _nameBuffer;
        int cix = 0;
        final int[] TYPES = charTypes.NAME_CHARS;

        switch (TYPES[ch]) {
            case XmlCharTypes.CT_NAME_NONE:
            case XmlCharTypes.CT_NAME_COLON: // not ok as first
            case XmlCharTypes.CT_NAME_NONFIRST:
            case InputCharTypes.CT_INPUT_NAME_MB_N:
                ok = false;
                break;

            case XmlCharTypes.CT_NAME_ANY:
                ok = true;
                break;

            default: // multi-byte (UTF-8) chars:
            {
                int needed;

                if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                    ch &= 0x1F;
                    needed = 1;
                } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                    ch &= 0x0F;
                    needed = 2;
                } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all...
                    ch &= 0x07;
                    needed = 3;
                } else { // 5- and 6-byte chars not valid xml chars
                    reportInvalidInitial(ch);
                    needed = ch = 1; // never really gets this far
                }
                if ((ix + needed) > byteLen) {
                    reportEofInName();
                }
                ix += needed;

                // All continuation bytes of the first char live in the first quad
                int q = quads[0];
                // Always need at least one more right away:
                int ch2 = (q >> 16) & 0xFF;
                if ((ch2 & 0xC0) != 0x80) {
                    reportInvalidOther(ch2);
                }
                ch = (ch << 6) | (ch2 & 0x3F);

                /* And then may need more. Note: here we do not do all the
                 * checks that UTF-8 text decoder might do. Reason is that
                 * name validity checking methods handle most of such checks
                 */
                if (needed > 1) {
                    ch2 = (q >> 8) & 0xFF;
                    if ((ch2 & 0xC0) != 0x80) {
                        reportInvalidOther(ch2);
                    }
                    ch = (ch << 6) | (ch2 & 0x3F);
                    if (needed > 2) { // 4 bytes? (need surrogates on output)
                        ch2 = q & 0xFF;
                        if ((ch2 & 0xC0) != 0x80) {
                            reportInvalidOther(ch2);
                        }
                        ch = (ch << 6) | (ch2 & 0x3F);
                    }
                }
                ok = XmlChars.is10NameStartChar(ch);
                if (needed > 2) { // outside of basic 16-bit range? need surrogates
                    /* so, let's first output first char (high surrogate),
                     * let second be output by later code
                     */
                    ch -= 0x10000; // to normalize it starting with 0x0
                    cbuf[cix++] = (char) (0xD800 + (ch >> 10));
                    ch = (0xDC00 | (ch & 0x03FF));
                }
            }
        }

        if (!ok) { // 0 to indicate it's first char, even with surrogates
            reportInvalidNameChar(ch, 0);
        }

        cbuf[cix++] = (char) ch; // the only char, or second (low) surrogate

        /* Whoa! Tons of code for just the start char. But now we get to
         * decode the name proper, at last!
         */
        int last_colon = -1;

        while (ix < byteLen) {
            ch = quads[ix >> 2]; // current quad, need to shift+mask
            int byteIx = (ix & 3);
            ch = (ch >> ((3 - byteIx) << 3)) & 0xFF;
            ++ix;

            // Ascii?
            switch (TYPES[ch]) {
                case XmlCharTypes.CT_NAME_NONE:
                case XmlCharTypes.CT_MULTIBYTE_N:
                    ok = false;
                    break;

                case XmlCharTypes.CT_NAME_COLON: // not ok as first
                    if (last_colon >= 0) {
                        reportMultipleColonsInName();
                    }
                    last_colon = cix;
                    ok = true;
                    break;

                case XmlCharTypes.CT_NAME_NONFIRST:
                case XmlCharTypes.CT_NAME_ANY:
                    ok = true;
                    break;

                default: {
                    int needed;
                    if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                        ch &= 0x1F;
                        needed = 1;
                    } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                        ch &= 0x0F;
                        needed = 2;
                    } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all...
                        ch &= 0x07;
                        needed = 3;
                    } else { // 5- and 6-byte chars not valid xml chars
                        reportInvalidInitial(ch);
                        needed = ch = 1; // never really gets this far
                    }
                    // ix already points past the lead byte, so ix + needed is the end of this char
                    if ((ix + needed) > byteLen) {
                        reportEofInName();
                    }

                    // Ok, always need at least one more. The shift is arithmetic, so mask
                    // down to the single byte: otherwise a failure would report the
                    // sign-extended remainder of the quad instead of the offending byte.
                    int ch2 = quads[ix >> 2]; // current quad, need to shift+mask
                    byteIx = (ix & 3);
                    ch2 = (ch2 >> ((3 - byteIx) << 3)) & 0xFF;
                    ++ix;

                    if ((ch2 & 0xC0) != 0x80) {
                        reportInvalidOther(ch2);
                    }
                    ch = (ch << 6) | (ch2 & 0x3F);

                    // Once again, some of validation deferred to name char validator
                    if (needed > 1) {
                        ch2 = quads[ix >> 2];
                        byteIx = (ix & 3);
                        ch2 = (ch2 >> ((3 - byteIx) << 3)) & 0xFF;
                        ++ix;

                        if ((ch2 & 0xC0) != 0x80) {
                            reportInvalidOther(ch2);
                        }
                        ch = (ch << 6) | (ch2 & 0x3F);
                        if (needed > 2) { // 4 bytes? (need surrogates on output)
                            ch2 = quads[ix >> 2];
                            byteIx = (ix & 3);
                            ch2 = (ch2 >> ((3 - byteIx) << 3)) & 0xFF;
                            ++ix;
                            if ((ch2 & 0xC0) != 0x80) {
                                reportInvalidOther(ch2);
                            }
                            ch = (ch << 6) | (ch2 & 0x3F);
                        }
                    }
                    ok = XmlChars.is10NameChar(ch);
                    if (needed > 2) { // surrogate pair? once again, let's output one here, one later on
                        ch -= 0x10000; // to normalize it starting with 0x0
                        if (cix >= cbuf.length) {
                            _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
                        }
                        cbuf[cix++] = (char) (0xD800 + (ch >> 10));
                        ch = 0xDC00 | (ch & 0x03FF);
                    }
                }
            }
            if (!ok) {
                reportInvalidNameChar(ch, cix);
            }
            if (cix >= cbuf.length) {
                _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
            }
            cbuf[cix++] = (char) ch;
        }

        /* Ok. Now we have the character array, and can construct the
         * String (the position of the single allowed colon, if any, was
         * recorded above for ns-aware mode...)
         */
        String baseName = new String(cbuf, 0, cix);
        // And finally, unalign if necessary
        if (lastQuadBytes < 4) {
            quads[qlen - 1] = lastQuad;
        }
        return symbols.addSymbol(hash, baseName, last_colon, quads, qlen);
    }