in lib/yamlcpp/src/stream.cpp [336:400]
void Stream::StreamInUtf16() const {
unsigned long ch = 0;
unsigned char bytes[2];
int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
bytes[0] = GetNextByte();
bytes[1] = GetNextByte();
if (!m_input.good()) {
return;
}
ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
if (ch >= 0xDC00 && ch < 0xE000) {
// Trailing (low) surrogate...ugh, wrong order
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
return;
}
if (ch >= 0xD800 && ch < 0xDC00) {
// ch is a leading (high) surrogate
// Four byte UTF-8 code point
// Read the trailing (low) surrogate
for (;;) {
bytes[0] = GetNextByte();
bytes[1] = GetNextByte();
if (!m_input.good()) {
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
return;
}
unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
if (chLow < 0xDC00 || chLow >= 0xE000) {
// Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the
// stream.
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
// Deal with the next UTF-16 unit
if (chLow < 0xD800 || chLow >= 0xE000) {
// Easiest case: queue the codepoint and return
QueueUnicodeCodepoint(m_readahead, ch);
return;
}
// Start the loop over with the new high surrogate
ch = chLow;
continue;
}
// Select the payload bits from the high surrogate
ch &= 0x3FF;
ch <<= 10;
// Include bits from low surrogate
ch |= (chLow & 0x3FF);
// Add the surrogacy offset
ch += 0x10000;
break;
}
}
QueueUnicodeCodepoint(m_readahead, ch);
}