in MiniEngine/Model/json.hpp [6454:7038]
token_type scan_string()
{
// reset token_buffer (ignore opening quote)
reset();
// we entered the function by reading an open quote
JSON_ASSERT(current == '\"');
while (true)
{
// get next character
switch (get())
{
// end of file while parsing string
case std::char_traits<char_type>::eof():
{
error_message = "invalid string: missing closing quote";
return token_type::parse_error;
}
// closing quote
case '\"':
{
return token_type::value_string;
}
// escapes
case '\\':
{
switch (get())
{
// quotation mark
case '\"':
add('\"');
break;
// reverse solidus
case '\\':
add('\\');
break;
// solidus
case '/':
add('/');
break;
// backspace
case 'b':
add('\b');
break;
// form feed
case 'f':
add('\f');
break;
// line feed
case 'n':
add('\n');
break;
// carriage return
case 'r':
add('\r');
break;
// tab
case 't':
add('\t');
break;
// unicode escapes
case 'u':
{
const int codepoint1 = get_codepoint();
int codepoint = codepoint1; // start with codepoint1
if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
{
error_message = "invalid string: '\\u' must be followed by 4 hex digits";
return token_type::parse_error;
}
// check if code point is a high surrogate
if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
{
// expect next \uxxxx entry
if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
{
const int codepoint2 = get_codepoint();
if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
{
error_message = "invalid string: '\\u' must be followed by 4 hex digits";
return token_type::parse_error;
}
// check if codepoint2 is a low surrogate
if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
{
// overwrite codepoint
codepoint = static_cast<int>(
// high surrogate occupies the most significant 22 bits
(static_cast<unsigned int>(codepoint1) << 10u)
// low surrogate occupies the least significant 15 bits
+ static_cast<unsigned int>(codepoint2)
// there is still the 0xD800, 0xDC00 and 0x10000 noise
// in the result so we have to subtract with:
// (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
- 0x35FDC00u);
}
else
{
error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
return token_type::parse_error;
}
}
else
{
error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
return token_type::parse_error;
}
}
else
{
if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
{
error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
return token_type::parse_error;
}
}
// result of the above calculation yields a proper codepoint
JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
// translate codepoint into bytes
if (codepoint < 0x80)
{
// 1-byte characters: 0xxxxxxx (ASCII)
add(static_cast<char_int_type>(codepoint));
}
else if (codepoint <= 0x7FF)
{
// 2-byte characters: 110xxxxx 10xxxxxx
add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
}
else if (codepoint <= 0xFFFF)
{
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
}
else
{
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
}
break;
}
// other characters after escape
default:
error_message = "invalid string: forbidden character after backslash";
return token_type::parse_error;
}
break;
}
// invalid control characters
case 0x00:
{
error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
return token_type::parse_error;
}
case 0x01:
{
error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
return token_type::parse_error;
}
case 0x02:
{
error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
return token_type::parse_error;
}
case 0x03:
{
error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
return token_type::parse_error;
}
case 0x04:
{
error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
return token_type::parse_error;
}
case 0x05:
{
error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
return token_type::parse_error;
}
case 0x06:
{
error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
return token_type::parse_error;
}
case 0x07:
{
error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
return token_type::parse_error;
}
case 0x08:
{
error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
return token_type::parse_error;
}
case 0x09:
{
error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
return token_type::parse_error;
}
case 0x0A:
{
error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
return token_type::parse_error;
}
case 0x0B:
{
error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
return token_type::parse_error;
}
case 0x0C:
{
error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
return token_type::parse_error;
}
case 0x0D:
{
error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
return token_type::parse_error;
}
case 0x0E:
{
error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
return token_type::parse_error;
}
case 0x0F:
{
error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
return token_type::parse_error;
}
case 0x10:
{
error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
return token_type::parse_error;
}
case 0x11:
{
error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
return token_type::parse_error;
}
case 0x12:
{
error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
return token_type::parse_error;
}
case 0x13:
{
error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
return token_type::parse_error;
}
case 0x14:
{
error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
return token_type::parse_error;
}
case 0x15:
{
error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
return token_type::parse_error;
}
case 0x16:
{
error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
return token_type::parse_error;
}
case 0x17:
{
error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
return token_type::parse_error;
}
case 0x18:
{
error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
return token_type::parse_error;
}
case 0x19:
{
error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
return token_type::parse_error;
}
case 0x1A:
{
error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
return token_type::parse_error;
}
case 0x1B:
{
error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
return token_type::parse_error;
}
case 0x1C:
{
error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
return token_type::parse_error;
}
case 0x1D:
{
error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
return token_type::parse_error;
}
case 0x1E:
{
error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
return token_type::parse_error;
}
case 0x1F:
{
error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
return token_type::parse_error;
}
// U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
case 0x20:
case 0x21:
case 0x23:
case 0x24:
case 0x25:
case 0x26:
case 0x27:
case 0x28:
case 0x29:
case 0x2A:
case 0x2B:
case 0x2C:
case 0x2D:
case 0x2E:
case 0x2F:
case 0x30:
case 0x31:
case 0x32:
case 0x33:
case 0x34:
case 0x35:
case 0x36:
case 0x37:
case 0x38:
case 0x39:
case 0x3A:
case 0x3B:
case 0x3C:
case 0x3D:
case 0x3E:
case 0x3F:
case 0x40:
case 0x41:
case 0x42:
case 0x43:
case 0x44:
case 0x45:
case 0x46:
case 0x47:
case 0x48:
case 0x49:
case 0x4A:
case 0x4B:
case 0x4C:
case 0x4D:
case 0x4E:
case 0x4F:
case 0x50:
case 0x51:
case 0x52:
case 0x53:
case 0x54:
case 0x55:
case 0x56:
case 0x57:
case 0x58:
case 0x59:
case 0x5A:
case 0x5B:
case 0x5D:
case 0x5E:
case 0x5F:
case 0x60:
case 0x61:
case 0x62:
case 0x63:
case 0x64:
case 0x65:
case 0x66:
case 0x67:
case 0x68:
case 0x69:
case 0x6A:
case 0x6B:
case 0x6C:
case 0x6D:
case 0x6E:
case 0x6F:
case 0x70:
case 0x71:
case 0x72:
case 0x73:
case 0x74:
case 0x75:
case 0x76:
case 0x77:
case 0x78:
case 0x79:
case 0x7A:
case 0x7B:
case 0x7C:
case 0x7D:
case 0x7E:
case 0x7F:
{
add(current);
break;
}
// U+0080..U+07FF: bytes C2..DF 80..BF
case 0xC2:
case 0xC3:
case 0xC4:
case 0xC5:
case 0xC6:
case 0xC7:
case 0xC8:
case 0xC9:
case 0xCA:
case 0xCB:
case 0xCC:
case 0xCD:
case 0xCE:
case 0xCF:
case 0xD0:
case 0xD1:
case 0xD2:
case 0xD3:
case 0xD4:
case 0xD5:
case 0xD6:
case 0xD7:
case 0xD8:
case 0xD9:
case 0xDA:
case 0xDB:
case 0xDC:
case 0xDD:
case 0xDE:
case 0xDF:
{
if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
{
return token_type::parse_error;
}
break;
}
// U+0800..U+0FFF: bytes E0 A0..BF 80..BF
case 0xE0:
{
if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
{
return token_type::parse_error;
}
break;
}
// U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
// U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
case 0xE1:
case 0xE2:
case 0xE3:
case 0xE4:
case 0xE5:
case 0xE6:
case 0xE7:
case 0xE8:
case 0xE9:
case 0xEA:
case 0xEB:
case 0xEC:
case 0xEE:
case 0xEF:
{
if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
{
return token_type::parse_error;
}
break;
}
// U+D000..U+D7FF: bytes ED 80..9F 80..BF
case 0xED:
{
if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
{
return token_type::parse_error;
}
break;
}
// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
case 0xF0:
{
if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
{
return token_type::parse_error;
}
break;
}
// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
case 0xF1:
case 0xF2:
case 0xF3:
{
if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
{
return token_type::parse_error;
}
break;
}
// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
case 0xF4:
{
if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
{
return token_type::parse_error;
}
break;
}
// remaining bytes (80..C1 and F5..FF) are ill-formed
default:
{
error_message = "invalid string: ill-formed UTF-8 byte";
return token_type::parse_error;
}
}
}
}