in sdk/serialization/azure-xml/src/main/java/com/azure/xml/implementation/aalto/in/ByteBasedScanner.java [177:395]
/**
 * Decodes a name whose UTF-8 bytes are packed into {@code quads}, validates it as an
 * XML name, and registers it in the given symbol table.
 * <p>
 * Bytes are packed four per int, most significant byte first. The last quad may hold
 * fewer than four bytes, in which case they occupy its low-order bytes; it is shifted
 * into the same alignment as the other quads for the duration of decoding and restored
 * before the symbol is added. Note that the restore only happens on the normal path:
 * if one of the {@code report...} helpers throws, {@code quads[qlen - 1]} is left in
 * its shifted form.
 * <p>
 * The decoded characters are written to {@code _nameBuffer}, which is grown on demand.
 * Code points that need four UTF-8 bytes are written as a surrogate pair.
 *
 * @param symbols table to which the decoded name is added
 * @param charTypes supplies the {@code NAME_CHARS} classification table, indexed by byte value
 * @param hash hash of the name; passed through to {@code symbols.addSymbol}
 * @param quads packed name bytes; the last used entry is modified temporarily
 * @param qlen number of entries of {@code quads} in use
 * @param lastQuadBytes number of name bytes held by the last quad (a value below 4
 *     triggers the temporary re-alignment)
 * @return the value returned by {@code symbols.addSymbol}
 * @throws XMLStreamException declared for the {@code report...} helpers, which are
 *     defined outside this method and are expected to throw on malformed UTF-8,
 *     truncated sequences, invalid name characters or a second colon
 */
protected final PName addUTFPName(ByteBasedPNameTable symbols, XmlCharTypes charTypes, int hash, int[] quads,
    int qlen, int lastQuadBytes) throws XMLStreamException {
    // 4 bytes per quad, except the last one which may hold fewer
    int byteLen = (qlen << 2) - 4 + lastQuadBytes;

    // A partial last quad has leading (not trailing) zero bytes. Shift it so that
    // byte N of the name can always be read from the same position; the original
    // value is kept so it can be put back before the quads are used as the key.
    int lastQuad;
    if (lastQuadBytes < 4) {
        lastQuad = quads[qlen - 1];
        // 8/16/24 bit left shift
        quads[qlen - 1] = (lastQuad << ((4 - lastQuadBytes) << 3));
    } else {
        lastQuad = 0;
    }

    // The first char is handled separately (name-start validation differs):
    int ch = (quads[0] >>> 24);
    boolean ok;
    int ix = 1; // index of the next byte to consume
    char[] cbuf = _nameBuffer;
    int cix = 0; // index of the next char to write
    final int[] TYPES = charTypes.NAME_CHARS;

    switch (TYPES[ch]) {
        case XmlCharTypes.CT_NAME_NONE:
        case XmlCharTypes.CT_NAME_COLON: // not ok as first
        case XmlCharTypes.CT_NAME_NONFIRST:
        case InputCharTypes.CT_INPUT_NAME_MB_N:
            ok = false;
            break;

        case XmlCharTypes.CT_NAME_ANY:
            ok = true;
            break;

        default: // multi-byte (UTF-8) chars:
        {
            int needed; // number of continuation bytes that must follow
            if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                ch &= 0x1F;
                needed = 1;
            } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                ch &= 0x0F;
                needed = 2;
            } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all...
                ch &= 0x07;
                needed = 3;
            } else { // 5- and 6-byte chars not valid xml chars
                reportInvalidInitial(ch);
                needed = ch = 1; // never really gets this far
            }
            if ((ix + needed) > byteLen) {
                reportEofInName();
            }
            ix += needed;

            // The whole sequence (at most 4 bytes) lives in the first quad
            int q = quads[0];
            // Always need at least one more right away:
            int ch2 = (q >> 16) & 0xFF;
            if ((ch2 & 0xC0) != 0x080) {
                reportInvalidOther(ch2);
            }
            ch = (ch << 6) | (ch2 & 0x3F);

            // And then may need more. Note: not all the checks a UTF-8 text decoder
            // might do are done here; the name validity checks cover most of them.
            if (needed > 1) {
                ch2 = (q >> 8) & 0xFF;
                if ((ch2 & 0xC0) != 0x080) {
                    reportInvalidOther(ch2);
                }
                ch = (ch << 6) | (ch2 & 0x3F);
                if (needed > 2) { // 4 bytes? (need surrogates on output)
                    ch2 = q & 0xFF;
                    if ((ch2 & 0xC0) != 0x080) {
                        reportInvalidOther(ch2);
                    }
                    ch = (ch << 6) | (ch2 & 0x3F);
                }
            }
            ok = XmlChars.is10NameStartChar(ch);
            if (needed > 2) { // outside of basic 16-bit range? need surrogates
                // Output the high surrogate here; the low surrogate is left in 'ch'
                // and written by the common code below.
                ch -= 0x10000; // to normalize it starting with 0x0
                cbuf[cix++] = (char) (0xD800 + (ch >> 10));
                ch = (0xDC00 | (ch & 0x03FF));
            }
        }
    }

    if (!ok) { // 0 to indicate it's first char, even with surrogates
        reportInvalidNameChar(ch, 0);
    }
    cbuf[cix++] = (char) ch; // the only char, or second (low) surrogate

    // Start char done; now decode the rest of the name.
    int last_colon = -1;

    while (ix < byteLen) {
        ch = quads[ix >> 2]; // current quad, need to shift+mask
        int byteIx = (ix & 3);
        ch = (ch >> ((3 - byteIx) << 3)) & 0xFF;
        ++ix;

        // Ascii?
        switch (TYPES[ch]) {
            case XmlCharTypes.CT_NAME_NONE:
            case XmlCharTypes.CT_MULTIBYTE_N:
                ok = false;
                break;

            case XmlCharTypes.CT_NAME_COLON: // at most one colon per name
                if (last_colon >= 0) {
                    reportMultipleColonsInName();
                }
                last_colon = cix;
                ok = true;
                break;

            case XmlCharTypes.CT_NAME_NONFIRST:
            case XmlCharTypes.CT_NAME_ANY:
                ok = true;
                break;

            default: {
                int needed; // number of continuation bytes that must follow
                if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                    ch &= 0x1F;
                    needed = 1;
                } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                    ch &= 0x0F;
                    needed = 2;
                } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all...
                    ch &= 0x07;
                    needed = 3;
                } else { // 5- and 6-byte chars not valid xml chars
                    reportInvalidInitial(ch);
                    needed = ch = 1; // never really gets this far
                }
                if ((ix + needed) > byteLen) {
                    reportEofInName();
                }

                // Each continuation byte is masked to 8 bits when extracted: the
                // signed shift otherwise leaves the quad's higher-order bytes (or
                // sign bits) attached, and that value would end up in the error
                // reported for an invalid continuation byte.

                // Ok, always need at least one more:
                int ch2 = quads[ix >> 2]; // current quad, need to shift+mask
                byteIx = (ix & 3);
                ch2 = (ch2 >> ((3 - byteIx) << 3)) & 0xFF;
                ++ix;
                if ((ch2 & 0xC0) != 0x080) {
                    reportInvalidOther(ch2);
                }
                ch = (ch << 6) | (ch2 & 0x3F);

                // Once again, some of validation deferred to name char validator
                if (needed > 1) {
                    ch2 = quads[ix >> 2];
                    byteIx = (ix & 3);
                    ch2 = (ch2 >> ((3 - byteIx) << 3)) & 0xFF;
                    ++ix;
                    if ((ch2 & 0xC0) != 0x080) {
                        reportInvalidOther(ch2);
                    }
                    ch = (ch << 6) | (ch2 & 0x3F);
                    if (needed > 2) { // 4 bytes? (need surrogates on output)
                        ch2 = quads[ix >> 2];
                        byteIx = (ix & 3);
                        ch2 = (ch2 >> ((3 - byteIx) << 3)) & 0xFF;
                        ++ix;
                        if ((ch2 & 0xC0) != 0x080) {
                            reportInvalidOther(ch2);
                        }
                        ch = (ch << 6) | (ch2 & 0x3F);
                    }
                }
                ok = XmlChars.is10NameChar(ch);
                if (needed > 2) { // surrogate pair? once again, let's output one here, one later on
                    ch -= 0x10000; // to normalize it starting with 0x0
                    if (cix >= cbuf.length) {
                        _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
                    }
                    cbuf[cix++] = (char) (0xD800 + (ch >> 10));
                    ch = 0xDC00 | (ch & 0x03FF);
                }
            }
        }
        if (!ok) {
            reportInvalidNameChar(ch, cix);
        }
        if (cix >= cbuf.length) {
            _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
        }
        cbuf[cix++] = (char) ch;
    }

    // Now we have the character array and can construct the String; the colon
    // position recorded above is handed to the symbol table along with it.
    String baseName = new String(cbuf, 0, cix);
    // And finally, unalign if necessary
    if (lastQuadBytes < 4) {
        quads[qlen - 1] = lastQuad;
    }
    return symbols.addSymbol(hash, baseName, last_colon, quads, qlen);
}